## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

## Importing the dataset over here

In [2]:
# Load the raw video statistics; the path is relative to the notebook's working directory.
DATA_PATH="all_stats_t_series.csv"
data=pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check that the columns were parsed as expected.
data.head(n=5)

Unnamed: 0,id,viewCount,likeCount,favoriteCount,commentCount
0,c_VrTcIY8kA,233296,82028.0,0,13239.0
1,GHyHfi7fnGQ,81592,4432.0,0,535.0
2,SIN9YUhH6cU,31523,1436.0,0,209.0
3,tPRge7PMVeM,45149,997.0,0,127.0
4,UlrZOaT8ays,26186,2108.0,0,42.0


## Checking for duplicate observations

In [5]:
# Count rows that repeat an earlier row in every column (the first occurrence is not counted).
data.duplicated(subset=None,keep='first').sum()

0

## Taking care of missing values if present over here

In [6]:
# Number of missing values in each column.
data.isnull().sum(axis=0)

id                0
viewCount         0
likeCount        14
favoriteCount     0
commentCount     16
dtype: int64

In [7]:
# Remove every row that has a missing value in any column.
# This mutates `data` in place, so re-running the earlier display cells will show the reduced frame.
data.dropna(axis=0,how='any',inplace=True)

## Filtering all the numerical features over here

In [8]:
# Collect the columns whose dtype is actually numeric (integers and floats).
# Testing for "not object" would also sweep in boolean, datetime or categorical
# columns, which are not numerical features.
numerical_features=data.select_dtypes(include='number').columns.tolist()
for feature in numerical_features:
  print(feature)

viewCount
likeCount
favoriteCount
commentCount


In [9]:
# Show only the numerical columns (all rows).
data.loc[:,numerical_features]

Unnamed: 0,viewCount,likeCount,favoriteCount,commentCount
0,233296,82028.0,0,13239.0
1,81592,4432.0,0,535.0
2,31523,1436.0,0,209.0
3,45149,997.0,0,127.0
4,26186,2108.0,0,42.0
...,...,...,...,...
19853,1614,13.0,0,0.0
19854,47818,76.0,0,6.0
19855,3756,30.0,0,2.0
19856,116365,823.0,0,15.0


## Filtering all the categorical features over here

In [10]:
# Collect (and list) every column stored with the object dtype; these are
# treated as the categorical features in the rest of the notebook.
cat_features=[]
for column in data.columns:
  if data[column].dtype=='O':
    cat_features.append(column)
    print(column)

id


In [11]:
# Show only the categorical columns (all rows).
data.loc[:,cat_features]

Unnamed: 0,id
0,c_VrTcIY8kA
1,GHyHfi7fnGQ
2,SIN9YUhH6cU
3,tPRge7PMVeM
4,UlrZOaT8ays
...,...
19853,8r0wy9zP0oU
19854,UWdnsdc6Bkc
19855,ZlGlZGIomZM
19856,bo9EkevW6iw


## Encoding the categorical features into numerical features over here

In [12]:
# Replace each categorical value with an integer code. Codes are assigned in
# order of first appearance, so the first distinct value becomes 0, the next 1, ...
# The column is overwritten in place on `data`.
for column in cat_features:
  distinct_values=data[column].unique()
  codes=dict(zip(distinct_values,range(len(distinct_values))))
  data[column]=data[column].map(codes)

In [13]:
# Display the frame after encoding: the categorical columns now hold integer codes.
data

Unnamed: 0,id,viewCount,likeCount,favoriteCount,commentCount
0,0,233296,82028.0,0,13239.0
1,1,81592,4432.0,0,535.0
2,2,31523,1436.0,0,209.0
3,3,45149,997.0,0,127.0
4,4,26186,2108.0,0,42.0
...,...,...,...,...,...
19853,19824,1614,13.0,0,0.0
19854,19825,47818,76.0,0,6.0
19855,19826,3756,30.0,0,2.0
19856,19827,116365,823.0,0,15.0


## Creating the features and labels over here

In [14]:
# Move the target to the last column under the name 'LIKECOUNT':
# take the series out, drop the old column, then append it back at the end.
# Not re-runnable as-is: a second run fails because 'likeCount' no longer exists.
like_counts=data['likeCount']
data.drop(columns=['likeCount'],inplace=True)
data['LIKECOUNT']=like_counts

In [15]:
# Build the feature matrix and the label vector by column name rather than by
# position, so the split does not silently depend on 'LIKECOUNT' being the
# last column of the frame.
# NOTE(review): the integer-encoded `id` column is still part of X. It is an
# arbitrary per-video code, so consider excluding it from the features.
TARGET_COLUMN='LIKECOUNT'
X=data.drop(columns=[TARGET_COLUMN]).values
y=data[TARGET_COLUMN].values

## Splitting the dataset into a training set and a testing set so the model can be evaluated on unseen data (this detects overfitting; it does not prevent it)

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts=train_test_split(X,y,test_size=0.2,random_state=0)
X_train,X_test,y_train,y_test=split_parts

## Training the model on the Training dataset over here

In [19]:
from sklearn.ensemble import RandomForestRegressor
# A small forest of 10 trees; random_state is fixed so the fitted model is repeatable.
forest_settings={'n_estimators':10,'random_state':0}
regressor=RandomForestRegressor(**forest_settings)
regressor.fit(X=X_train,y=y_train)

## Evaluating the performance of the model on the testing dataset over here

In [20]:
# Predict the like counts for the held-out rows.
y_pred=regressor.predict(X_test)
# Put predictions (column 0) next to the true values (column 1) for a quick visual check.
comparison=np.concatenate((y_pred.reshape(-1,1),y_test.reshape(-1,1)),axis=1)
# Limit the precision only while printing this table, instead of changing
# numpy's print settings for every later cell in the kernel.
with np.printoptions(precision=2):
  print(comparison)

[[275272.7 174510. ]
 [ 14275.6  16228. ]
 [   773.2    303. ]
 ...
 [  1758.5   1638. ]
 [ 13216.    5218. ]
 [ 86784.6 155143. ]]


## Checking the performance of the model using the R² score

In [21]:
from sklearn.metrics import r2_score
# Coefficient of determination of the predictions against the held-out labels.
r2_score(y_true=y_test,y_pred=y_pred)

0.9326626715463638