In [33]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

: 

In [34]:
import warnings

# Install a process-wide "ignore" filter for FutureWarning so later cells
# do not print them.
warnings.simplefilter("ignore", category=FutureWarning)


# 1-Exploratory Data Analysis (EDA)

### Import Libraries 

In [35]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

### Load the Dataset

In [36]:
# Location of the subscribers CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/youtube-subscribers-data-2024/youtube_subscribers_data.csv'
df = pd.read_csv(DATA_PATH)

### Show first 5 rows

In [37]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

### Check the total number of rows and columns

In [38]:
# (number of rows, number of columns) of the loaded frame.
df.shape

### Check the column headers

In [39]:
# Column labels of the frame; unique() collapses any repeated labels in the
# displayed Index.
df.columns.unique()

### Check the unique entries of each column

In [40]:
# Distinct values in the 'Country' column (bracket access, matching the
# other column lookups in this notebook).
df['Country'].unique()

In [41]:
# Distinct values in the 'Category' column (bracket access, matching the
# other column lookups in this notebook).
df['Category'].unique()

In [42]:
# Distinct values in the 'Primary language' column.
df['Primary language'].unique()

In [43]:
# Distinct values in the 'Name' column.
df['Name'].unique()

In [44]:
# Distinct values in the 'Brand channel' column.
df['Brand channel'].unique()

In [45]:
# Distinct values in the 'Subscribers (millions)' column.
df['Subscribers (millions)'].unique()

In [46]:
# Rows whose 'Country' value is exactly 'Pakistan'.
df.loc[df['Country'].eq('Pakistan')]

### Checking for Null Values in a Dataset  

In [47]:
# Number of missing values in each column.
df.isna().sum()

### Check the datatype of each column

In [48]:
# Print the frame summary: column names, non-null counts and dtypes.
df.info()

### Summary statistics of dataset

In [49]:
# Summary statistics; with default arguments pandas summarises only the
# numeric columns when the frame has any.
df.describe()

## Visualizations

#### Histogram Visualization

In [50]:
# Histogram of subscriber counts, labelled through the Axes that hist() returns.
ax = df['Subscribers (millions)'].hist(bins=10, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Subscribers (millions)')
ax.set_xlabel('Subscribers (millions)')
ax.set_ylabel('Frequency')
plt.show()

#### Box Plot Visualization

In [51]:
# Box plot of subscriber counts; the title is set on the Axes seaborn returns.
ax = sns.boxplot(x=df['Subscribers (millions)'])
ax.set_title('Box Plot for Subscribers')
plt.show()

#### Scatter Plot Visualization

In [52]:
# Subscriber count plotted against the row index of the frame.
ax = sns.scatterplot(x=df.index, y=df['Subscribers (millions)'])
ax.set(
    title='Subscribers Scatter Plot',
    xlabel='Index',
    ylabel='Subscribers (millions)',
)
plt.show()

## Correlation Analysis

In [53]:
# Pairwise correlation of the float64/int64 columns, rendered as an annotated heatmap.
numeric_frame = df.select_dtypes(include=['float64', 'int64'])
correlation = numeric_frame.corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

## Outlier Detection

In [54]:
# Flag rows whose subscriber count lies more than 1.5 * IQR outside the
# first/third quartile.
subscribers = df['Subscribers (millions)']
Q1, Q3 = subscribers.quantile(0.25), subscribers.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (subscribers < lower_fence) | (subscribers > upper_fence)

outliers = df[is_outlier]
print("Outliers:\n", outliers)

## Feature Distribution Analysis

In [55]:
# Number of channels per category, drawn as a bar chart.
category_counts = df['Category'].value_counts()
ax = category_counts.plot(kind='bar', color='skyblue')
ax.set_title('Category Distribution')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
plt.show()

## Grouped Aggregations

#### Analyze Subscribers (millions) by Category.

In [56]:
# Total subscribers (millions) per category: printed as text, then as a bar chart.
grouped = df.groupby('Category')['Subscribers (millions)'].sum()
print(grouped)
ax = grouped.plot.bar(color='skyblue')
ax.set_title('Total Subscribers by Category')
ax.set_ylabel('Subscribers (millions)')
plt.show()


## Insights from Relationships Between Features

In [57]:
# pairplot() is figure-level: it returns a grid that owns its own figure.
# plt.title() would only title whichever subplot happens to be current, so
# the title is placed on the figure itself. y > 1 lifts it above the top row
# of subplots so it does not overlap them.
pair_grid = sns.pairplot(df, hue='Category')
pair_grid.figure.suptitle('Pairwise Analysis', y=1.02)
plt.show()

## Additional Analyses

#### Top Channels by Subscribers

In [58]:
# The five rows with the largest subscriber counts.
top_channels = df.nlargest(n=5, columns='Subscribers (millions)')
print("Top Channels:\n", top_channels)


#### Country-wise Distribution

In [59]:
# Share of channels per country as a pie chart on an explicitly created 10x8 figure.
fig, ax = plt.subplots(figsize=(10, 8))
df['Country'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax)
ax.set_title('Country-wise Channel Distribution')
plt.show()

# Data Preprocessing

### Handle Missing Values

In [60]:
# Impute missing values in place on `df`:
#   * float64/int64 columns -> column mean
#   * object columns        -> most frequent value (first mode)
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())
for column in categorical_columns:
    modes = df[column].mode()
    if modes.empty:
        # mode() ignores NaN, so a column that is entirely missing has no
        # mode; indexing [0] would raise. Leave such a column untouched.
        continue
    df[column] = df[column].fillna(modes.iloc[0])

### Encode Categorical Variables

In [61]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each so the dummy columns are not perfectly collinear.
one_hot_columns = ['Brand channel', 'Primary language', 'Category', 'Country']
df_encoded = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

In [62]:
# Preview only the first rows of the one-hot encoded frame instead of
# rendering the whole (wide) frame.
df_encoded.head()

### Scale or Normalize Numerical Features

In [63]:
# Standardise the subscriber column to zero mean / unit variance.
# NOTE(review): the scaler is fitted on the full frame before the
# train/test split performed further down, and the column it scales is the
# one later used as `y` — confirm this is intended.
target_column = 'Subscribers (millions)'
scaler = StandardScaler()
scaled_target = scaler.fit_transform(df_encoded[[target_column]])
df_encoded[target_column] = scaled_target

In [64]:
from sklearn.model_selection import train_test_split

# Features are every column except the target and 'Name'. 'Name' is a
# free-text channel identifier that get_dummies left untouched, so it is
# dropped here exactly as the modelling cell below does.
X = df_encoded.drop(columns=['Subscribers (millions)', 'Name'])
y = df_encoded['Subscribers (millions)']
# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Machine Learning Model

In [65]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Integer-encode the categorical features on a fresh copy of `df`.
# This replaces the one-hot `df_encoded` built earlier in the notebook.
# Each column gets its own fitted encoder, kept in `label_encoders`, so every
# mapping can be inspected or inverted later. Re-fitting one shared encoder
# would keep only the mapping of the last column.
categorical_features = ['Brand channel', 'Primary language', 'Category', 'Country']
label_encoders = {}

df_encoded = df.copy()
for column in categorical_features:
    label_encoder = LabelEncoder()
    df_encoded[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

X = df_encoded.drop(columns=['Subscribers (millions)', 'Name'])  # drop 'Name' since it's just an identifier
y = df_encoded['Subscribers (millions)']

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest regressor with 100 trees, seeded for reproducibility.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Hold-out metrics on the 20% test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")