In [389]:
from sklearn import svm
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from numpy import vstack
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_recall_fscore_support, classification_report,confusion_matrix, precision_recall_curve

# FOR CHANNEL DATA

In [390]:
# Read the tab-separated channel export and attach a constant label of 1 to
# every row, then show the frame's dimensions and its first two rows.
df_channel = pd.read_csv('channel_data.csv', sep='\t').assign(target=1)
print(df_channel.shape)
df_channel.head(2)

(7847, 7)


Unnamed: 0,hidden_subscriber_count,video_count,view_count,subscriber_count,comment_count,channel_id,target
0,0,41,47614,428,0,UCzc3mk_DhB09J9k9sGjxXeQ,1
1,0,0,0,18,0,UCnGXyNWs0f6sEBcbA7gLnCw,1


In [391]:
# Hold out 20% of the channel rows; the label and the channel id are excluded
# from the feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    df_channel.drop(columns=['target', 'channel_id']),
    df_channel['target'],
    test_size=0.2,
    random_state=42,
)

# SVM

In [395]:
def svm_model(X_train,X_test,y_train,y_test,split,contamination,kernel='poly',gamma=1,degree=4):
    """Fit a One-Class SVM on ``X_train`` and print how it labels ``X_test``.

    Parameters
    ----------
    X_train : DataFrame
        Training features. Its ``columns`` and ``head(2)`` are printed, so it
        must be a DataFrame.
    X_test : array-like
        Features scored by the fitted model.
    y_train : array-like
        Only its shape is printed; the model is fit without labels.
    y_test : array-like
        Reference labels compared against the predictions.
    split : float
        Test fraction; used only in the printed banner.
    contamination : float
        Passed to ``OneClassSVM`` as ``nu``.
    kernel, gamma, degree
        Forwarded to ``OneClassSVM``. The defaults are the values that were
        previously hard-coded in this function.

    Returns
    -------
    None
        All results are printed.
    """
    print("results for splitting ",1-split,split)
    print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
    print(X_train.columns)
    print(X_train.head(2))
    print("nu is ",contamination)

    oneclass = svm.OneClassSVM(kernel=kernel, gamma=gamma, nu=contamination, degree=degree)
    oneclass.fit(X_train)
    fraud_pred = oneclass.predict(X_test)

    print("unique fraud pred",np.unique(fraud_pred))

    # Pin the label order so the matrix is always 2x2. Without `labels`, a run
    # in which only one class appears yields a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_test,fraud_pred,labels=[-1, 1]).ravel()

    print("true negative: ",tn,"false positive",fp,"false negative",fn,"true positive",tp)

    print(classification_report(y_test,fraud_pred))
    print("Recall is ",recall_score(y_test,fraud_pred, average=None))
    


In [396]:
# Confirm which feature columns remain after dropping the label and the channel id.
X_train.columns

Index(['hidden_subscriber_count', 'video_count', 'view_count',
       'subscriber_count', 'comment_count'],
      dtype='object')

In [397]:
# Channel data: 80/20 split banner, nu = 0.1.
svm_model(X_train, X_test, y_train, y_test, split=0.2, contamination=0.1)


results for splitting  0.8 0.2
(6277, 5) (1570, 5) (6277,) (1570,)
Index(['hidden_subscriber_count', 'video_count', 'view_count',
       'subscriber_count', 'comment_count'],
      dtype='object')
      hidden_subscriber_count  video_count  view_count  subscriber_count  \
2031                        0            7         171                27   
1219                        0            9          23                 1   

      comment_count  
2031              0  
1219              0  
nu is  0.1
unique fraud pred [-1  1]
true negative:  0 false positive 0 false negative 141 true positive 1429
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.91      0.95      1570

    accuracy                           0.91      1570
   macro avg       0.50      0.46      0.48      1570
weighted avg       1.00      0.91      0.95      1570

Recall is  [0.         0.91019108]


# ISOLATION FOREST

In [31]:
def isolation_forest(X_train,X_test,y_train,y_test,split,contamination,random_state=42):
    """Fit an Isolation Forest on ``X_train`` and print how it labels ``X_test``.

    Parameters
    ----------
    X_train, X_test : array-like
        Training features and the features to score.
    y_train : array-like
        Only its shape is printed; the model is fit without labels.
    y_test : array-like
        Reference labels compared against the predictions.
    split : float
        Test fraction; used only in the printed banner.
    contamination : float
        Forwarded to ``IsolationForest(contamination=...)``.
    random_state : int, optional
        Seed for the forest, so repeated runs print the same numbers.

    Returns
    -------
    None
        All results are printed.
    """
    print("results for splitting ",1-split,split)
    print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

    print("contamination is ",contamination)
    # The forest is randomized; without a seed every re-run of the notebook
    # reports a different confusion matrix.
    isolation_model = IsolationForest(contamination=contamination, random_state=random_state)
    isolation_model.fit(X_train)
    fraud_pred = isolation_model.predict(X_test)

    print("unique fraud pred",np.unique(fraud_pred))

    # Pin the label order so the matrix is always 2x2, even when only one
    # class shows up in labels and predictions.
    tn, fp, fn, tp = confusion_matrix(y_test,fraud_pred,labels=[-1, 1]).ravel()

    print("true negative: ",tn,"false positive",fp,"false negative",fn,"true positive",tp)

    print(classification_report(y_test,fraud_pred))
    print("Recall is ",recall_score(y_test,fraud_pred, average=None))

    

      
    

In [32]:
# Channel data: 80/20 split banner, contamination = 0.1.
isolation_forest(X_train, X_test, y_train, y_test, split=0.2, contamination=0.1)


results for splitting  0.8 0.2
(6277, 5) (1570, 5) (6277,) (1570,)
contamination is  0.1
unique fraud pred [-1  1]
true negative:  0 false positive 0 false negative 147 true positive 1423
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.91      0.95      1570

    accuracy                           0.91      1570
   macro avg       0.50      0.45      0.48      1570
weighted avg       1.00      0.91      0.95      1570

Recall is  [0.         0.90636943]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
def mcd_model(X_train,X_test,y_train,y_test,split,contamination,random_state=42):
    """Fit an ``EllipticEnvelope`` on ``X_train`` and print how it labels ``X_test``.

    Missing values in both feature tables are replaced with 0 before fitting.

    Parameters
    ----------
    X_train, X_test : DataFrame
        Training features and the features to score; ``fillna`` is called on
        both.
    y_train : array-like
        Only its shape is printed; the model is fit without labels.
    y_test : array-like
        Reference labels compared against the predictions.
    split : float
        Test fraction; used only in the printed banner.
    contamination : float
        Forwarded to ``EllipticEnvelope(contamination=...)``.
    random_state : int, optional
        Seed for the covariance estimator, so repeated runs agree.

    Returns
    -------
    None
        All results are printed.
    """
    X_train = X_train.fillna(0)
    X_test = X_test.fillna(0)

    print("results for splitting ",1-split,split)
    print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

    print("contamination is ",contamination)
    # Named `envelope` rather than `mcd_model` so the estimator does not
    # shadow this function's own name inside its body.
    envelope = EllipticEnvelope(contamination=contamination, random_state=random_state)
    envelope.fit(X_train)
    fraud_pred = envelope.predict(X_test)

    print("unique fraud pred",np.unique(fraud_pred))

    # Pin the label order so the matrix is always 2x2, even when only one
    # class shows up in labels and predictions.
    tn, fp, fn, tp = confusion_matrix(y_test,fraud_pred,labels=[-1, 1]).ravel()

    print("true negative: ",tn,"false positive",fp,"false negative",fn,"true positive",tp)

    print(classification_report(y_test,fraud_pred))
    print("Recall is ",recall_score(y_test,fraud_pred, average=None))

    
    
    

In [36]:
# Channel data: 80/20 split banner, contamination = 0.1.
mcd_model(X_train, X_test, y_train, y_test, split=0.2, contamination=0.1)

results for splitting  0.8 0.2
(6277, 5) (1570, 5) (6277,) (1570,)
contamination is  0.1




unique fraud pred [-1  1]
true negative:  0 false positive 0 false negative 155 true positive 1415
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.90      0.95      1570

    accuracy                           0.90      1570
   macro avg       0.50      0.45      0.47      1570
weighted avg       1.00      0.90      0.95      1570

Recall is  [0.         0.90127389]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
def lof_predict(model, trainX, testX):
    """Label ``testX`` by running ``model.fit_predict`` on train and test together.

    The two inputs are stacked row-wise (train first), the model is fit on the
    stacked data, and only the labels belonging to the test rows are returned.
    """
    n_train = len(trainX)
    stacked = vstack((trainX, testX))
    all_labels = model.fit_predict(stacked)
    # Test rows sit after the training rows in the stacked array.
    return all_labels[n_train:]

In [40]:
def lof_model(X_train,X_test,y_train,y_test,split,contamination):
    """Label ``X_test`` with a Local Outlier Factor model and print the results.

    Missing values in both feature tables are replaced with 0. Prediction is
    delegated to ``lof_predict``, which fits the model on the training and
    test rows together and returns the labels for the test rows.

    Parameters
    ----------
    X_train, X_test : DataFrame
        Training features and the features to score; ``fillna`` is called on
        both.
    y_train : array-like
        Only its shape is printed.
    y_test : array-like
        Reference labels compared against the predictions.
    split : float
        Test fraction; used only in the printed banner.
    contamination : float
        Forwarded to ``LocalOutlierFactor(contamination=...)``.

    Returns
    -------
    None
        All results are printed.
    """
    X_train = X_train.fillna(0)
    X_test = X_test.fillna(0)

    print("results for splitting ",1-split,split)
    print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

    print("contamination is ",contamination)
    # Named `lof` rather than `lof_model` so the estimator does not shadow
    # this function's own name inside its body.
    lof = LocalOutlierFactor(contamination=contamination)
    # No separate fit(X_train) here: lof_predict calls fit_predict on the
    # combined data, which refits the model and would discard that earlier fit.
    fraud_pred = lof_predict(lof, X_train, X_test)

    print("unique fraud pred",np.unique(fraud_pred))

    # Pin the label order so the matrix is always 2x2, even when only one
    # class shows up in labels and predictions.
    tn, fp, fn, tp = confusion_matrix(y_test,fraud_pred,labels=[-1, 1]).ravel()

    print("true negative: ",tn,"false positive",fp,"false negative",fn,"true positive",tp)

    print(classification_report(y_test,fraud_pred))
    print("Recall is ",recall_score(y_test,fraud_pred, average=None))

    
    

In [41]:
# Channel data: 80/20 split banner, contamination = 0.1.
lof_model(X_train, X_test, y_train, y_test, split=0.2, contamination=0.1)

results for splitting  0.8 0.2
(6277, 5) (1570, 5) (6277,) (1570,)
contamination is  0.1
unique fraud pred [-1  1]
true negative:  0 false positive 0 false negative 166 true positive 1404
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.89      0.94      1570

    accuracy                           0.89      1570
   macro avg       0.50      0.45      0.47      1570
weighted avg       1.00      0.89      0.94      1570

Recall is  [0.         0.89426752]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# For Likes data

In [398]:
# Read the tab-separated likes export and discard the columns that are not
# used as model features.
df = pd.read_csv('dummy_likedata.csv', sep='\t').drop(
    columns=['Unnamed: 0','comment_count', 'description', 'duration', 'genre',
             'is_family_friendly', 'is_paid', 'is_unlisted', 'title', 'upload_date',
             'dislikes', 'likes', 'views', 'channel_id', 'is_verified', 'name','video_days']
)

# Every row gets the constant label 1.
df['Target'] = 1

# Hold out 20% of the rows; the label and the video id are excluded from the
# feature matrix. NOTE: this rebinds X_train/X_test/y_train/y_test, which
# earlier cells bound to the channel data.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Target', 'id']),
    df['Target'],
    test_size=0.2,
    random_state=42,
)

print(df.columns)

# Replace missing feature values with 0 in both partitions.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)


Index(['id', 'Activeness', 'duration2', 'Favorability', 'daysRate', 'Target'], dtype='object')


In [399]:
# Dimensions of the four partitions produced by the likes-data split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((36457, 4), (9115, 4), (36457,), (9115,))

In [400]:
def svm_model2(X_train,X_test,y_train,y_test,split,contamination,kernel='poly',gamma=0.001,degree=3):
    """Fit a One-Class SVM on ``X_train`` and print how it labels ``X_test``.

    Parameters
    ----------
    X_train : DataFrame
        Training features. Its ``columns`` and ``head(2)`` are printed, so it
        must be a DataFrame.
    X_test : array-like
        Features scored by the fitted model.
    y_train : array-like
        Only its shape is printed; the model is fit without labels.
    y_test : array-like
        Reference labels compared against the predictions.
    split : float
        Test fraction; used only in the printed banner.
    contamination : float
        Passed to ``OneClassSVM`` as ``nu``.
    kernel, gamma, degree
        Forwarded to ``OneClassSVM``. ``kernel`` and ``gamma`` default to the
        values previously hard-coded here; ``degree`` was not passed before,
        so 3 is assumed to match the library default — confirm against the
        scikit-learn documentation.

    Returns
    -------
    None
        All results are printed.
    """
    print("results for splitting ",1-split,split)
    print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
    print(X_train.columns)
    print(X_train.head(2))
    print("nu is ",contamination)

    oneclass = svm.OneClassSVM(kernel=kernel, gamma=gamma, nu=contamination, degree=degree)
    oneclass.fit(X_train)
    fraud_pred = oneclass.predict(X_test)

    print("unique fraud pred",np.unique(fraud_pred))

    # Pin the label order so the matrix is always 2x2. Without `labels`, a run
    # in which only one class appears yields a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_test,fraud_pred,labels=[-1, 1]).ravel()

    print("true negative: ",tn,"false positive",fp,"false negative",fn,"true positive",tp)

    print(classification_report(y_test,fraud_pred))
    print("Recall is ",recall_score(y_test,fraud_pred, average=None))


In [401]:
# Likes data: 80/20 split banner, nu = 0.1.
svm_model2(X_train, X_test, y_train, y_test, split=0.2, contamination=0.1)


results for splitting  0.8 0.2
(36457, 4) (9115, 4) (36457,) (9115,)
Index(['Activeness', 'duration2', 'Favorability', 'daysRate'], dtype='object')
       Activeness  duration2  Favorability   daysRate
7720     0.012022         17      0.109091  11.385475
30585    0.160000          5      0.058824   0.183486
nu is  0.1
unique fraud pred [-1  1]
true negative:  0 false positive 0 false negative 807 true positive 8308
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.91      0.95      9115

    accuracy                           0.91      9115
   macro avg       0.50      0.46      0.48      9115
weighted avg       1.00      0.91      0.95      9115

Recall is  [0.         0.91146462]


In [49]:
# Likes data: 80/20 split banner, contamination = 0.1.
isolation_forest(X_train, X_test, y_train, y_test, split=0.2, contamination=0.1)


results for splitting  0.8 0.2
(36457, 4) (9115, 4) (36457,) (9115,)
contamination is  0.1
unique fraud pred [-1  1]
true negative:  0 false positive 0 false negative 888 true positive 8227
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.90      0.95      9115

    accuracy                           0.90      9115
   macro avg       0.50      0.45      0.47      9115
weighted avg       1.00      0.90      0.95      9115

Recall is  [0.         0.90257817]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
# Likes data: 80/20 split banner, contamination = 0.1.
mcd_model(X_train, X_test, y_train, y_test, split=0.2, contamination=0.1)

results for splitting  0.8 0.2
(36457, 4) (9115, 4) (36457,) (9115,)
contamination is  0.1
unique fraud pred [-1  1]
true negative:  0 false positive 0 false negative 908 true positive 8207
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.90      0.95      9115

    accuracy                           0.90      9115
   macro avg       0.50      0.45      0.47      9115
weighted avg       1.00      0.90      0.95      9115

Recall is  [0.         0.90038398]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
# Likes data: 80/20 split banner, contamination = 0.1.
lof_model(X_train, X_test, y_train, y_test, split=0.2, contamination=0.1)

results for splitting  0.8 0.2
(36457, 4) (9115, 4) (36457,) (9115,)
contamination is  0.1
unique fraud pred [-1  1]
true negative:  0 false positive 0 false negative 918 true positive 8197
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.90      0.95      9115

    accuracy                           0.90      9115
   macro avg       0.50      0.45      0.47      9115
weighted avg       1.00      0.90      0.95      9115

Recall is  [0.         0.89928689]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
