In [59]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


In [71]:
# Загрузка датасета
df = pd.read_csv("Most popular 1000 Youtube videos.csv")


In [72]:
# Первичный анализ данных
print("Обзор данных:")
print(df.info())
print("\nПропущенные значения:")
print(df.isnull().sum())

Обзор данных:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   rank         1000 non-null   int64 
 1   Video        1000 non-null   object
 2   Video views  1000 non-null   object
 3   Likes        1000 non-null   object
 4   Dislikes     527 non-null    object
 5   Category     982 non-null    object
 6   published    1000 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 54.8+ KB
None

Пропущенные значения:
rank             0
Video            0
Video views      0
Likes            0
Dislikes       473
Category        18
published        0
dtype: int64


# 1. Обработка пропусков в данных

In [68]:
# Заполняем пропуски медианным значением для числовых столбцов
num_cols = ['Dislikes']  
imputer = SimpleImputer(strategy='median')
df[num_cols] = imputer.fit_transform(df[num_cols])


In [73]:
# Преобразуем колонку 'Dislikes' в числовой формат
df['Dislikes'] = df['Dislikes'].str.replace(',', '').astype(float)

In [39]:
# Заполняем пропуски медианным значением
imputer = SimpleImputer(strategy='median')
df[['Dislikes']] = imputer.fit_transform(df[['Dislikes']])

In [74]:
print("\nПропущенные значения:")
print(df.isnull().sum())


Пропущенные значения:
rank             0
Video            0
Video views      0
Likes            0
Dislikes       473
Category        18
published        0
dtype: int64


In [75]:
# Заполняем пропуски в 'Category' наиболее частым значением
imputer_cat = SimpleImputer(strategy='most_frequent')
df[['Category']] = imputer_cat.fit_transform(df[['Category']])

In [76]:
print("\nПропущенные значения:")
print(df.isnull().sum())


Пропущенные значения:
rank             0
Video            0
Video views      0
Likes            0
Dislikes       473
Category         0
published        0
dtype: int64


# 2. Кодирование категориальных признаков

In [77]:
encoder = LabelEncoder()
df['Category_encoded'] = encoder.fit_transform(df['Category'])


In [78]:
print(df.head())


   rank                                              Video    Video views  \
0     1  Lil Nas X - Old Town Road (Official Movie) ft....     54,071,677   
1     2  20 Tennis shots if they were not filmed, NOBOD...      3,471,237   
2     3                 JoJo Siwa - Karma (Official Video)     34,206,747   
3     4    David Kushner - Daylight (Official Music Video)     18,558,390   
4     5  Wiz Khalifa - See You Again ft. Charlie Puth [...  6,547,981,039   

        Likes  Dislikes Category  published  Category_encoded  
0   3,497,955   78799.0    Music       2019                 7  
1      19,023     859.0    Music       2017                 7  
2     293,563       NaN    Music       2024                 7  
3     680,732       NaN    Music       2023                 7  
4  44,428,537       NaN    Music       2015                 7  


# 3. Масштабирование числовых данных

In [79]:
df['Likes'] = df['Likes'].str.replace(',', '').astype(float)
df['Video views'] = df['Video views'].str.replace(',', '').astype(float)


num_cols = ['Video views', 'Likes', 'Dislikes']
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Проверяем обработанные данные
print("\nОбработанный датасет:")
print(df.head())

# Сохранение обработанного датасета
df.to_csv("processed_dataset.csv", index=False)



Обработанный датасет:
   rank                                              Video  Video views  \
0     1  Lil Nas X - Old Town Road (Official Movie) ft....     0.008251   
1     2  20 Tennis shots if they were not filmed, NOBOD...     0.000523   
2     3                 JoJo Siwa - Karma (Official Video)     0.005217   
3     4    David Kushner - Daylight (Official Music Video)     0.002827   
4     5  Wiz Khalifa - See You Again ft. Charlie Puth [...     1.000000   

      Likes  Dislikes Category  published  Category_encoded  
0  0.078723  0.442587    Music       2019                 7  
1  0.000418  0.004825    Music       2017                 7  
2  0.006598       NaN    Music       2024                 7  
3  0.015312       NaN    Music       2023                 7  
4  1.000000       NaN    Music       2015                 7  
