<a href="https://colab.research.google.com/github/JuliustheCreator/channel-growth-model/blob/main/analysis/youtube_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Importing Modules**


---



In [1]:
import pandas as pd
import tensorflow as tf
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
#from google.colab import files




#### **Importing Cleaned Dataset**


---



In [2]:
import os
import pandas as pd

# Location of the dataset CSV. The default is the original local checkout;
# set the CHANNEL_GROWTH_DATA environment variable to point at the file on any
# other machine (or on Colab) instead of editing this cell.
DEFAULT_DATA_PATH = "D:/ALL_PROJECTS-MAIN/UTUBE_ANALYSIS/channel-growth-model/data/topSubscribed.csv"    # forward slashes work on Windows too
data_path = os.environ.get("CHANNEL_GROWTH_DATA", DEFAULT_DATA_PATH)

# Report whether the file is where we expect it, and fail with an actionable
# message (same exception type pandas would raise) when it is not.
data_found = os.path.exists(data_path)
print("exists:", data_found)
if not data_found:
    raise FileNotFoundError(
        f"Dataset not found at {data_path!r}; set CHANNEL_GROWTH_DATA to the CSV location."
    )

df = pd.read_csv(data_path)
print(df.head())


exists: True
   Rank             Youtube Channel  Subscribers      Video Views Video Count  \
0     1                    T-Series  234,000,000  212,900,271,553      18,515   
1     2              YouTube Movies  161,000,000                0           0   
2     3  Cocomelon - Nursery Rhymes  152,000,000  149,084,178,448         846   
3     4                   SET India  150,000,000  137,828,094,104     103,200   
4     5                     MrBeast  128,000,000   21,549,128,785         733   

           Category  Started  
0             Music     2006  
1  Film & Animation     2015  
2         Education     2006  
3             Shows     2006  
4     Entertainment     2012  


In [19]:
# Show the column labels of the loaded DataFrame.
print(df.columns)


Index(['Rank', 'Youtube Channel', 'Subscribers', 'Video Views', 'Video Count',
       'Category', 'Started'],
      dtype='object')


### **Cleaning Dataset Further for Model**

In [20]:
import numpy as np
import pandas as pd
from datetime import datetime

# Number of synthetic zero-activity channels to add, and the seed that makes
# the sampled ages identical on every run.
N_DUMMY_CHANNELS = 250
DUMMY_SEED = 220

# Compute Age of the channel from 'Started' year
current_year = datetime.now().year
df['Age'] = current_year - df['Started']

# Remove channels (rows) where there are no videos or no views.
# The CSV stores these counts as comma-formatted text (e.g. "18,515"), so they
# must be parsed before being compared with 0; comparing the raw strings with
# the integer 0 never matches and silently keeps the empty channels.
views_numeric = df['Video Views'].astype(str).str.replace(',', '').astype(float)
videos_numeric = df['Video Count'].astype(str).str.replace(',', '').astype(float)
has_no_content = views_numeric.eq(0) | videos_numeric.eq(0)
df = df.loc[~has_no_content]

# Augmenting dataset (adding dummy channels).
# Ages are drawn around the mean age of the remaining channels, truncated to
# whole years and floored at 0 so that 'Age' and 'Started' always agree and no
# channel gets a negative age.
rng = np.random.default_rng(DUMMY_SEED)
sampled_ages = rng.normal(loc=df['Age'].mean(), scale=df['Age'].std(), size=N_DUMMY_CHANNELS)
dummy_ages = np.clip(sampled_ages, 0, None).astype(int)

# Build all dummy rows in one frame and concatenate once, instead of growing
# the DataFrame row by row.
dummy_channels = pd.DataFrame({
    'Youtube Channel': ['Dummy Channel'] * N_DUMMY_CHANNELS,
    'Subscribers': [0] * N_DUMMY_CHANNELS,
    'Video Views': [0] * N_DUMMY_CHANNELS,
    'Video Count': [0] * N_DUMMY_CHANNELS,
    'Category': ['Unknown'] * N_DUMMY_CHANNELS,
    'Started': current_year - dummy_ages,  # reverse-calculate a start year
    'Rank': [None] * N_DUMMY_CHANNELS,
    'Age': dummy_ages,
})

df = pd.concat([df, dummy_channels], ignore_index=True)


In [21]:
# Removing channels (rows) where there are no videos or no views.
# Counts may still be comma-formatted text at this point, so parse them before
# comparing with 0 (a raw string never equals the integer 0). This also drops
# any dummy rows appended earlier, so re-running the cell does not pile up
# extra synthetic channels.
views_numeric = df['Video Views'].astype(str).str.replace(',', '').astype(float)
videos_numeric = df['Video Count'].astype(str).str.replace(',', '').astype(float)
df = df.loc[~(views_numeric.eq(0) | videos_numeric.eq(0))]

# Augmenting dataset (adding dummy channels).
# Draw every age in a single seeded call so the result is reproducible, floor
# at 0 so no channel has a negative age, and truncate to whole years.
dummy_count = 250
age_rng = np.random.default_rng(220)
sampled_ages = age_rng.normal(loc=df['Age'].mean(), scale=df['Age'].std(), size=dummy_count)
dummy_ages = np.clip(sampled_ages, 0, None).astype(int)

# One frame, one concat: avoids the quadratic cost of appending row by row.
dummy_channels = pd.DataFrame({
    'Youtube Channel': ['Dummy Channel'] * dummy_count,
    'Subscribers': [0] * dummy_count,
    'Video Views': [0] * dummy_count,
    'Video Count': [0] * dummy_count,
    'Age': dummy_ages,
})

df = pd.concat([df, dummy_channels], ignore_index=True)

### **Building the Regression Model (Neural Network)**


---



In [24]:
# Strip the thousands separators from the count columns and store them as
# floats so they can be fed to the scaler and the model.
cols_to_clean = ['Video Views', 'Video Count', 'Subscribers']
df[cols_to_clean] = df[cols_to_clean].apply(
    lambda column: column.astype(str).str.replace(',', '').astype(float)
)

In [25]:
# Selecting required columns
X = df[['Video Views', 'Video Count', 'Age']]
y = df['Subscribers']
# Clean numeric columns (remove commas and convert to numbers)



# Scaling the features
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 220)

# Creating Model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation = 'relu', input_shape = [X_train.shape[1]]),
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dense(1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### **Training the Model**


---



In [26]:
# Configure the network for regression: mean-squared-error loss minimised
# with the Adam optimizer.
model.compile(loss = 'mean_squared_error', optimizer = 'adam')

# Fit on the training split, holding back 20% of it for per-epoch validation.
# The returned History object is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    validation_split = 0.2,
    batch_size = 50,
    epochs = 100,
)

Epoch 1/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 559894621061120.0000 - val_loss: 883318274916352.0000
Epoch 2/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 635274853875712.0000 - val_loss: 883318274916352.0000
Epoch 3/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 614300179759104.0000 - val_loss: 883318274916352.0000
Epoch 4/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 642434161704960.0000 - val_loss: 883318207807488.0000
Epoch 5/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 551202043461632.0000 - val_loss: 883318207807488.0000
Epoch 6/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 480385179844608.0000 - val_loss: 883317939372032.0000
Epoch 7/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 546651827601408.0000 - 

### **Evaluating the Model**


---



In [27]:
# Score the trained network on the held-out split; the model was compiled
# with a mean-squared-error loss and no extra metrics, so a single value
# comes back.
loss = model.evaluate(x = X_test, y = y_test)
print(f"Mean squared error: {loss}")

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 473215805685760.0000 
Mean squared error: 466938341883904.0


### **Exporting and Testing Model**


---



In [30]:
# To Test the Model with Custom Values
views = float(input('Input Video Views: ')) / 1_000_000
videos = float(input('Input Video Count: ')) / 1000
age = float(input('Input Age of Channel: '))

# Prediction
test_values = scaler.transform(np.array([[views, videos, age]]))

predicted_subscribers = model.predict(test_values)
print(f"Estimated Subscriber Count: {int(predicted_subscribers[0][0]) * 1_000_000:,}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Estimated Subscriber Count: 18,661,000,000


