In [1]:
# Making all necessary imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
from tqdm import tqdm

In [2]:
# Importing the generated dataset and dropping columns with null values.

dfvideos = pd.read_csv("Videos_DF1.csv")
dfvideos = dfvideos.dropna()

dfvideos.head()

Unnamed: 0.1,Unnamed: 0,title,description,keywords,channel_links,video,date,length,views
0,0,I got the Fortnite Only Up WORLD RECORD! (Spee...,⬆️ PLAY MY ONLY UP MAP NOW!! ► 5264-1761-9807❤...,"video, sharing, camera phone, video phone, fre...",https://www.youtube.com/@TGplays,https://www.youtube.com/watch?v=4HlBgHmknY4,2023-07-12T19:36:52-07:00,PT19M55S,1.2M
1,1,Ron DeSantis: It is important to stand for a c...,2024 GOP presidential candidate Gov. Ron DeSan...,"DeSantis, Ron DeSantis, DeSantis abortion, abo...",https://www.youtube.com/@FoxNews,https://www.youtube.com/watch?v=pWpOn6C0YAk,2024-01-09T16:14:58-08:00,PT5M23S,26K
2,2,"Game Theory: Viewers' Choice, Cyborgs, Fatalit...",Your voices have been heard! As thanks for sup...,"Chrono Trigger, Mario, Super Mario, Illusion o...",https://www.youtube.com/@GameTheory,https://www.youtube.com/watch?v=z4QwsHsu3uw,2011-07-06T09:05:29-07:00,PT8M27S,958K
3,3,C-R-O-W-N-E-D - Kirby's Return to Dream Land +...,MY LINKS:●Main channel: https://www.youtube.co...,"music, extended, ost",https://www.youtube.com/@AacroXtensions,https://www.youtube.com/watch?v=iPx1YkOVGKE,2023-04-26T17:51:46-07:00,PT30M1S,1.2K
4,4,Tostarena: Night - Super Mario Odyssey Music E...,MY LINKS:●Main channel: https://www.youtube.co...,"music, extended, ost",https://www.youtube.com/@AacroXtensions,https://www.youtube.com/watch?v=xYY8KI_00tY,2023-06-28T22:01:18-07:00,PT30M2S,3.1K


In [3]:
from gensim.models import Word2Vec

# Defining the word2vec function
# Tokenizing the text in each column

title_tokens = [str(title).split() for title in dfvideos['title']]
description_tokens = [str(description).split() for description in dfvideos['description']]
keywords_tokens = [str(keywords).split() for keywords in dfvideos['keywords']]

# Training Word2Vec model
word2vec_model = Word2Vec(title_tokens + description_tokens + keywords_tokens, vector_size=500, window=5, min_count=1, workers=4)

def vectorize_with_word2vec(tokens, model):
    vectors = []
    for token_list in tokens:
        vector = sum([model.wv[word] for word in token_list if word in model.wv])
        vectors.append(vector)
    return vectors


# Applying Word2Vec to each column and add vectors to DataFrame
dfvideos['title_vectors'] = vectorize_with_word2vec(title_tokens, word2vec_model)
dfvideos['description_vectors'] = vectorize_with_word2vec(description_tokens, word2vec_model)
dfvideos['keywords_vectors'] = vectorize_with_word2vec(keywords_tokens, word2vec_model)


In [None]:
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Combining the text to use in TF-IDF

dfvideos['combined_text'] = dfvideos['title'] + ' ' + dfvideos['description'] + ' ' + dfvideos['keywords']

dfvideos['combined_text'].fillna('', inplace=True)


# Vectorizing and dropping variables that are no longer usefull

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(dfvideos['combined_text'])

tfidf_csr = csr_matrix(tfidf_matrix)

n_components = 5000

svd = TruncatedSVD(n_components=n_components)
tfidf_reduced = svd.fit_transform(tfidf_matrix)

del tfidf_matrix, tfidf_vectorizer

# Adding the reduced TF-IDF vectors to my DataFrame, 
# then dropping the vectors to save space

dfvideos['tfidf_vector'] = tfidf_reduced.tolist()


del tfidf_csr

In [None]:
# Creating functions to convert the views and duration (length)
# to a usable numeric value.

def convert_views(view_count):
    if 'K' in view_count:
        return float(view_count.replace('K', '').replace(',', '')) * 1000
    elif 'M' in view_count:
        return float(view_count.replace('M', '').replace(',', '')) * 1_000_000
    elif 'B' in view_count:
        return float(view_count.replace('B', '').replace(',', '')) * 1_000_000_000
    else:
        try:
            return float(view_count.replace(',', ''))
        except:
            return np.nan


def convert_video_length(length):
    minutes, seconds = 0, 0

    try:
        # Extracting minutes
        if 'M' in length:
            minutes = int(length.split('M')[0][2:])

        # Extracting seconds
        if 'S' in length:
            seconds = int(length.split('S')[0][-2:])

        # Calculating total seconds
        total_seconds = minutes * 60 + seconds
        return total_seconds
    
    except:
        return np.nan
    
# Applying functions to the dataframe
dfvideos['views'] = dfvideos['views'].apply(convert_views)
dfvideos['length'] = dfvideos['length'].apply(convert_video_length)

In [None]:
# Removing the youtube link from the channel_links column
# Since it repeats in every entry

dfvideos['channel_links'] = dfvideos['channel_links'].apply(lambda x: x[24:])

In [None]:
# Converting date to a usable format

dfvideos['date'] = pd.to_datetime(dfvideos['date'], utc=True)

dfvideos['year'] = dfvideos['date'].dt.year
dfvideos['month'] = dfvideos['date'].dt.month
dfvideos['day'] = dfvideos['date'].dt.day
dfvideos['hour'] = dfvideos['date'].dt.hour
dfvideos['minute'] = dfvideos['date'].dt.minute


dfvideos['day_of_week'] = dfvideos['date'].dt.dayofweek

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Preparing MultiLabelBinarizer, encoding the channels to
# A useful format, for use as category.

mlb = MultiLabelBinarizer()

channel_matrix = mlb.fit_transform(dfvideos['channel_links'].str.split())

dfvideos['channel_vector'] = channel_matrix.tolist()

dfvideos['channel_vector']

0        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
2        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
3        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                               ...                        
17007    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
17008    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
17009    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
17010    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
17011    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
Name: channel_vector, Length: 16934, dtype: object

In [None]:
# Selecting relevant numeric features for use in regression

features = dfvideos.columns[9:]

X = dfvideos.dropna()[features].drop(columns='combined_text')
X = X.reset_index(drop=True)
y = dfvideos.dropna()["views"]

KeyError: "['combined_text'] not found in axis"

In [None]:
# Concatenating the vectors to the X variable, necessary
# for use in the regression algorithm

vector_columns = ['title_vectors', 'description_vectors', 'keywords_vectors', 'channel_vector', 'tfidf_vector']

for col in vector_columns:
    print(col)
    rows = pd.DataFrame(list(X[col].values))
    rows.columns = rows.columns.astype(str) + "_" + col
    X = pd.concat([X, rows], axis= 1)

X.drop(columns= vector_columns, inplace= True)

X

In [None]:
from sklearn.preprocessing import StandardScaler

# Scaling features, in case its needed.

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# First trying ridge regression.


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Ridge(alpha=2.5)


model.fit(X_train, y_train)


y_pred = model.predict(X_test)


mse = r2_score(y_test, y_pred)
print(f'r² score: {np.sqrt(mse)}')
print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Trying RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42)


rf_model = RandomForestRegressor(n_estimators=200, criterion="poisson", random_state=42, verbose= 2, n_jobs= 10)

rf_model.fit(X_train, y_train)

y_pred = model.predict(X_test)


mse = r2_score(y_test, y_pred)
print(f'r² score: {np.sqrt(mse)}')
print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)

NameError: name 'train_test_split' is not defined

In [10]:
import xgboost as xgb


# Trying xgboost regressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                          max_depth = 5, alpha = 10, n_estimators = 100, verbosity = 2, random_state=42)

xg_reg.fit(X_train, y_train)

y_pred = xg_reg.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R² Score (XGBoost): {r2}")

NameError: name 'train_test_split' is not defined