In [3]:
import pandas as pd  # Pandas: توفر هياكل بيانات وأدوات تحليل البيانات (مثل DataFrame).
import numpy as np   # NumPy: يدعم المصفوفات والماتريكس متعددة الأبعاد والكبيرة، مع الوظائف الرياضية للتعامل مع هذه المصفوفات.
import matplotlib.pyplot as plt  # Matplotlib: مكتبة رسم بياني لإنشاء رسومات ثابتة وتفاعلية ومتحركة في Python.
import seaborn as sns  # Seaborn: تعتمد على Matplotlib وتوفر واجهة عالية المستوى لرسم رسوم بيانية إحصائية جذابة ومعلوماتية.


In [8]:
data = pd.read_csv("youtubers_df.csv")

In [9]:
# عرض أول 5 صفوف من DataFrame 'data' للحصول على نظرة عامة على هيكل ومحتويات مجموعة البيانات.
data.head()


Unnamed: 0,Rank,Username,Categories,Suscribers,Country,Visits,Likes,Comments,Links
0,1,tseries,Música y baile,253500000.0,India,111300.0,3200.0,120.0,http://youtube.com/channel/UCq-Fj5jknLsUf-MWSy...
1,2,MrBeast,"Videojuegos, Humor",210100000.0,Estados Unidos,95400000.0,5400000.0,16600.0,http://youtube.com/channel/UCX6OQ3DkcsbYNE6H8u...
2,3,CoComelon,Educación,168100000.0,Unknown,4100000.0,21600.0,0.0,http://youtube.com/channel/UCbCmjCuTUZos6Inko4...
3,4,SETIndia,,165400000.0,India,23300.0,239.0,9.0,http://youtube.com/channel/UCpEhnqL0y41EpW2TvW...
4,5,KidsDianaShow,"Animación, Juguetes",116500000.0,Unknown,4100000.0,8800.0,0.0,http://youtube.com/channel/UCk8GzjMOrta8yxDcKf...


# Data Cleaning

In [10]:
# احسب وعرض عدد القيم المفقودة في كل عمود من DataFrame 'data'.
data.isnull().sum()


Rank            0
Username        0
Categories    324
Suscribers      0
Country         0
Visits          0
Likes           0
Comments        0
Links           0
dtype: int64

### 1. تحليل عمود 'الفئات'
- **فهم طبيعة عمود 'الفئات'**: هل هي بيانات تصنيفية؟ هل الفئات ضرورية لتحليلك؟
- **تحديد سبب القيم المفقودة**: هل هي مفقودة فعلاً، أم أنها مُشفرة بطريقة ما (مثلاً، ممثلة كفراغات، 'NaN'، 'None'، إلخ.)؟
؟


### 2. اتخاذ قرار بشأن استراتيجية للتعامل مع البيانات المفقودة
- **الخيار أ: إزالة الصفوف التي تحتوي على 'فئات' مفقودة**: إذا كان عدد القيم المفقودة ليس كبيرًا بالمقارنة مع حجم مجموعة البيانات بأكملها وإذا كانت هذه القيم ليست حاسمة، فيمكنك إزالة هذه الصفوف.


### الخيار ب: ملء القيم المفقودة
- إذا كانت 'الفئات' مهمة، ففكر في ملء القيم المفقودة:
  - باستخدام قيمة عنصر نائب مثل 'غير معروف' أو 'أخرى'.
  - استنادًا إلى اتجاهات البيانات الأخرى (مثل أكثر الفئات شيوعًا، أو استخدام نموذج للتنبؤ بالفئة المفقودة).


# Practice

In [13]:
data.describe()

Unnamed: 0,Suscribers,Visits,Likes,Comments
count,1000.0,1000.0,1000.0,1000.0
mean,22609800.0,1063825.0,49171.48,1866.295
std,17339980.0,3729356.0,210715.6,20503.291953
min,12300000.0,0.0,0.0,0.0
25%,14300000.0,33100.0,479.25,3.0
50%,17300000.0,193200.0,3400.0,70.0
75%,24425000.0,884450.0,30800.0,458.25
max,253500000.0,95400000.0,5400000.0,532000.0


In [14]:
rows, columns = data.shape
print(f"The DataFrame has {rows} rows and {columns} columns.")

The DataFrame has 1000 rows and 9 columns.


In [15]:
number_of_rows = len(data)
print(f"The DataFrame has {number_of_rows} rows.")

The DataFrame has 1000 rows.


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Rank        1000 non-null   object 
 1   Username    1000 non-null   object 
 2   Categories  676 non-null    object 
 3   Suscribers  1000 non-null   float64
 4   Country     1000 non-null   object 
 5   Visits      1000 non-null   float64
 6   Likes       1000 non-null   float64
 7   Comments    1000 non-null   float64
 8   Links       1000 non-null   object 
dtypes: float64(4), object(5)
memory usage: 70.4+ KB


# Option A: Fill missing 'Categories' with 'Unknown'
data['Categories'].fillna('Unknown', inplace=True)


In [17]:
# Option A: Remove rows with missing 'Categories'
data = data.dropna(subset=['Categories'])


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 676 entries, 0 to 999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Rank        676 non-null    object 
 1   Username    676 non-null    object 
 2   Categories  676 non-null    object 
 3   Suscribers  676 non-null    float64
 4   Country     676 non-null    object 
 5   Visits      676 non-null    float64
 6   Likes       676 non-null    float64
 7   Comments    676 non-null    float64
 8   Links       676 non-null    object 
dtypes: float64(4), object(5)
memory usage: 52.8+ KB


In [22]:
data.loc[:, 'Rank'] = pd.to_numeric(data['Rank'], errors='coerce')


In [24]:
data.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop_duplicates(inplace=True)


In [25]:
data.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop_duplicates(inplace=True)


In [26]:
data.drop_duplicates(inplace=True, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop_duplicates(inplace=True, ignore_index=True)


In [27]:
data.drop_duplicates(inplace=True, ignore_index=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop_duplicates(inplace=True, ignore_index=True)


In [28]:
data = data.loc[~data.duplicated(keep='first'), :].reset_index(drop=True)


In [29]:
data['Country'] = data['Country'].str.strip()

In [30]:
data['Suscribers'] = data['Suscribers'].astype(int)
data['Visits'] = data['Visits'].astype(int)
data['Likes'] = data['Likes'].astype(int)
data['Comments'] = data['Comments'].astype(int)


In [31]:
data.to_csv('cleaned_data.csv', index=False)

In [32]:
df = pd.read_csv('cleaned_data.csv')

In [33]:
df.head()

Unnamed: 0,Rank,Username,Categories,Suscribers,Country,Visits,Likes,Comments,Links
0,1.0,tseries,Música y baile,253500000,India,111300,3200,120,http://youtube.com/channel/UCq-Fj5jknLsUf-MWSy...
1,2.0,MrBeast,"Videojuegos, Humor",210100000,Estados Unidos,95400000,5400000,16600,http://youtube.com/channel/UCX6OQ3DkcsbYNE6H8u...
2,3.0,CoComelon,Educación,168100000,Unknown,4100000,21600,0,http://youtube.com/channel/UCbCmjCuTUZos6Inko4...
3,5.0,KidsDianaShow,"Animación, Juguetes",116500000,Unknown,4100000,8800,0,http://youtube.com/channel/UCk8GzjMOrta8yxDcKf...
4,,PewDiePie,"Películas, Videojuegos",111400000,Estados Unidos,2900000,182000,8100,http://youtube.com/channel/UC-lHJZR3Gqxm24_Vd_...


In [34]:
data.dropna(subset=['Rank'], inplace=True)

In [35]:
df.head()

Unnamed: 0,Rank,Username,Categories,Suscribers,Country,Visits,Likes,Comments,Links
0,1.0,tseries,Música y baile,253500000,India,111300,3200,120,http://youtube.com/channel/UCq-Fj5jknLsUf-MWSy...
1,2.0,MrBeast,"Videojuegos, Humor",210100000,Estados Unidos,95400000,5400000,16600,http://youtube.com/channel/UCX6OQ3DkcsbYNE6H8u...
2,3.0,CoComelon,Educación,168100000,Unknown,4100000,21600,0,http://youtube.com/channel/UCbCmjCuTUZos6Inko4...
3,5.0,KidsDianaShow,"Animación, Juguetes",116500000,Unknown,4100000,8800,0,http://youtube.com/channel/UCk8GzjMOrta8yxDcKf...
4,,PewDiePie,"Películas, Videojuegos",111400000,Estados Unidos,2900000,182000,8100,http://youtube.com/channel/UC-lHJZR3Gqxm24_Vd_...


In [36]:
nan_rank_rows = data[data['Rank'],isna()]

NameError: name 'isna' is not defined

In [37]:
nan_rank_rows = data[data['Rank'].isna()]
print(nan_rank_rows)

Empty DataFrame
Columns: [Rank, Username, Categories, Suscribers, Country, Visits, Likes, Comments, Links]
Index: []


In [38]:
df.head()

Unnamed: 0,Rank,Username,Categories,Suscribers,Country,Visits,Likes,Comments,Links
0,1.0,tseries,Música y baile,253500000,India,111300,3200,120,http://youtube.com/channel/UCq-Fj5jknLsUf-MWSy...
1,2.0,MrBeast,"Videojuegos, Humor",210100000,Estados Unidos,95400000,5400000,16600,http://youtube.com/channel/UCX6OQ3DkcsbYNE6H8u...
2,3.0,CoComelon,Educación,168100000,Unknown,4100000,21600,0,http://youtube.com/channel/UCbCmjCuTUZos6Inko4...
3,5.0,KidsDianaShow,"Animación, Juguetes",116500000,Unknown,4100000,8800,0,http://youtube.com/channel/UCk8GzjMOrta8yxDcKf...
4,,PewDiePie,"Películas, Videojuegos",111400000,Estados Unidos,2900000,182000,8100,http://youtube.com/channel/UC-lHJZR3Gqxm24_Vd_...


In [39]:
data.to_csv('cleaned_data.csv', index=False)

In [40]:
df.head()

Unnamed: 0,Rank,Username,Categories,Suscribers,Country,Visits,Likes,Comments,Links
0,1.0,tseries,Música y baile,253500000,India,111300,3200,120,http://youtube.com/channel/UCq-Fj5jknLsUf-MWSy...
1,2.0,MrBeast,"Videojuegos, Humor",210100000,Estados Unidos,95400000,5400000,16600,http://youtube.com/channel/UCX6OQ3DkcsbYNE6H8u...
2,3.0,CoComelon,Educación,168100000,Unknown,4100000,21600,0,http://youtube.com/channel/UCbCmjCuTUZos6Inko4...
3,5.0,KidsDianaShow,"Animación, Juguetes",116500000,Unknown,4100000,8800,0,http://youtube.com/channel/UCk8GzjMOrta8yxDcKf...
4,,PewDiePie,"Películas, Videojuegos",111400000,Estados Unidos,2900000,182000,8100,http://youtube.com/channel/UC-lHJZR3Gqxm24_Vd_...


In [41]:
nan_rank_rows = data[data['Rank'].isna()]
print(nan_rank_rows)

Empty DataFrame
Columns: [Rank, Username, Categories, Suscribers, Country, Visits, Likes, Comments, Links]
Index: []


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676 entries, 0 to 675
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Rank        473 non-null    float64
 1   Username    676 non-null    object 
 2   Categories  676 non-null    object 
 3   Suscribers  676 non-null    int64  
 4   Country     676 non-null    object 
 5   Visits      676 non-null    int64  
 6   Likes       676 non-null    int64  
 7   Comments    676 non-null    int64  
 8   Links       676 non-null    object 
dtypes: float64(1), int64(4), object(4)
memory usage: 47.7+ KB


In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 473 entries, 0 to 675
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Rank        473 non-null    float64
 1   Username    473 non-null    object 
 2   Categories  473 non-null    object 
 3   Suscribers  473 non-null    int32  
 4   Country     473 non-null    object 
 5   Visits      473 non-null    int32  
 6   Likes       473 non-null    int32  
 7   Comments    473 non-null    int32  
 8   Links       473 non-null    object 
dtypes: float64(1), int32(4), object(4)
memory usage: 29.6+ KB


In [44]:
data.head()

Unnamed: 0,Rank,Username,Categories,Suscribers,Country,Visits,Likes,Comments,Links
0,1.0,tseries,Música y baile,253500000,India,111300,3200,120,http://youtube.com/channel/UCq-Fj5jknLsUf-MWSy...
1,2.0,MrBeast,"Videojuegos, Humor",210100000,Estados Unidos,95400000,5400000,16600,http://youtube.com/channel/UCX6OQ3DkcsbYNE6H8u...
2,3.0,CoComelon,Educación,168100000,Unknown,4100000,21600,0,http://youtube.com/channel/UCbCmjCuTUZos6Inko4...
3,5.0,KidsDianaShow,"Animación, Juguetes",116500000,Unknown,4100000,8800,0,http://youtube.com/channel/UCk8GzjMOrta8yxDcKf...
5,7.0,LikeNastyaofficial,Juguetes,110400000,Unknown,4100000,14500,0,http://youtube.com/channel/UCJplp5SjeGSdVdwsfb...


In [1]:
# First, identify the categories in the Categories columns. Note that some channels are more than one category.
cat_list = []
for i in file['Categories']:
    split_list = []
    split_list = i.split(',')
    for j in split_list:
        j = j.strip()
        cat_list.append(j)

# Now I isolate the unique categories, by using the set datatype:
cat_set = set(cat_list)


# Next, I convert the unique categories into a single string of each category, separated by commas.
# This is important to reduce the number of queries to the translation tool / URL to a minimum.
# The translate_text function throws an error if you query the same URL more than ~7 times in rapid succession.
cat_list = list(cat_set)
cat_str = ''
for i in cat_list:
    cat_str = cat_str + i + ','
    
# Now we translate:
english = ts.translate_text(cat_str)

# Then undo the single string back into a list:
english_list = english.split(',')

# Then make a dictionary to match the Spanish phrases to the English translations.
trans_dict = {}
for i,ii in enumerate(cat_list):
    trans_dict[ii] = english_list[i]
    
# Next we translate the column using the dictionary:
translation_list = []
for i in file['Categories']:
    entry = i.split(',')
    entry_list = []
    for j in entry:
        j = j.strip()
        j = trans_dict[j]
        entry_list.append(j)
    translation_list.append(entry_list)
file['Categories'] = translation_list

# Countries are much simpler, as they only have singular values per entry:
# A similar process is followed as above, with fewer steps. 
# Consult the comments above if you want to understand why each step is taken
country_list = list(file['Country'].unique())
country_str = ''
for i in country_list:
    country_str = country_str + i + ','
english_c = ts.translate_text(country_str)
english_list_c = english_c.split(',')
trans_dict = {}
for i,ii in enumerate(country_list):
    trans_dict[ii] = english_list_c[i]
file['Country'] = file['Country'].replace(trans_dict)

display(file.head())

NameError: name 'file' is not defined

In [2]:
file = pd.read_csv(youtubers_df.csv)

NameError: name 'pd' is not defined