In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/1000-most-subscribed-youtube-channels-in-germany/top-1000-most-subscribed-youtube-channels-in-germany.csv


### Import the Libraries

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

### Load the Dataset

In [3]:
df=pd.read_csv('/kaggle/input/1000-most-subscribed-youtube-channels-in-germany/top-1000-most-subscribed-youtube-channels-in-germany.csv')

### See the first 5 rows of dataset

In [4]:
df.head()

Unnamed: 0,rank,Youtuber,subscribers,video views,video count,category,started
0,1,Tsuriki Show,34100000,42490526838,4739,Entertainment,2019
1,2,Kidibli (Kinder Spielzeug Kanal),29600000,15673364837,1236,Entertainment,2015
2,3,Kurzgesagt – In a Nutshell,23600000,3145706013,271,Education,2013
3,4,boxtoxtv,23500000,18303986629,1559,Comedy,2022
4,5,HaerteTest,19500000,3420864412,1712,Science & Technology,2011


### Check the null value in dataset

In [5]:
df.isnull().sum()

rank           0
Youtuber       0
subscribers    0
video views    0
video count    0
category       0
started        0
dtype: int64

### Check the datatype of dataset

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   rank         1000 non-null   int64 
 1   Youtuber     1000 non-null   object
 2   subscribers  1000 non-null   object
 3   video views  1000 non-null   object
 4   video count  1000 non-null   object
 5   category     1000 non-null   object
 6   started      1000 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


### Convert 'subscribers', 'video views', 'video count' to numeric

In [8]:
df['subscribers'] = df['subscribers'].str.replace(',', '').astype(float)
df['video views'] = df['video views'].str.replace(',', '').astype(float)
df['video count'] = df['video count'].str.replace(',', '').astype(float)

### Encode categorical column ('category') using Label Encoding

In [9]:
label_encoder = LabelEncoder()
df['category'] = label_encoder.fit_transform(df['category'])

### Define features (X) and target (y)


In [10]:
X = df.drop(columns=['rank', 'Youtuber'])  # Drop non-numeric and ID-like columns
y = df['subscribers']  # Predicting 'subscribers'

### Normalize data

In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Split dataset (80% train, 20% test)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

### Initialize models

In [13]:
models = {
    "Support Vector Machine": SVR(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42)
}

### Train and evaluate models


In [14]:
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results[name] = {"MSE": mse, "MAE": mae, "R2 Score": r2}

### Convert results to DataFrame and display


In [15]:
results_df = pd.DataFrame(results).T
print(results_df)

                                 MSE            MAE  R2 Score
Support Vector Machine  1.751130e+12  707332.011101 -0.097739
Random Forest           5.303842e+09    9479.360000  0.996675
Decision Tree           1.615809e+09    9219.360000  0.998987
