According to YouTube's terms of service, a video which "Promis[es] money, products, software, or gaming perks for free if viewers install software, download an app, or perform other tasks." is spam, and posting such videos is against YouTube's terms of service. This project uses machine learning and statistical methods to find videos that fit this description and automatically report them.


The following types of content are not allowed on YouTube. Keep in mind this list isn't a complete list.

    Making exaggerated promises, such as claims that viewers can get rich fast or that a miracle treatment can cure chronic illnesses such as cancer.
    Promoting cash gifting or other pyramid schemes.
    Accounts dedicated to cash gifting schemes.
    Videos that promise "You'll make $50,000 tomorrow with this plan!"

Don’t post content on YouTube if it fits any of the descriptions noted below.

    Links to or promotes third-party services that artificially inflate metrics like views, likes, and subscribers
    Content linking to or promoting third-party view count or subscriber gaming websites or services
    Offering to subscribe to another creator’s channel only if they subscribe to your channel (“sub4sub”)
        Note: You're allowed to encourage viewers to subscribe, hit the like button, share, or leave a comment
    Content featuring a creator purchasing their views from a third party with the intent of promoting the service

    Here are some examples of content that’s not allowed on YouTube.

    A video testimonial in which a creator shows themselves successfully purchasing artificial page traffic from a third party
    A video in which a creator links to a third party artificial page traffic provider in a promotional or supportive context. For example: “I got 1 million subscribers on this video in a day and you can too!”
    A video that tries to force or trick viewers into watching another video through deceptive means (for example: a misleadingly labeled info card)
    Channels dedicated to artificial channel engagement traffic or promoting businesses that exist for this sole purpose

    #To Explain here:
    1. small sample of videos from any one channel, even if all vids on the channel are scam videos





In [22]:
import os
import json 
import pandas as pd
import pymongo
#from google.colab import drive
from pymongo import MongoClient
import socket
import urllib.request as urllib2
import pandas as pd
from urllib.error import HTTPError




In [2]:
#connect to the api
from googleapiclient.discovery import build
# SECURITY: an API key committed to source must be treated as leaked -- revoke
# it and supply a fresh one through the YOUTUBE_API_KEY environment variable.
# The literal is kept only as a fallback so existing runs keep working.
gkey = os.environ.get("YOUTUBE_API_KEY", "AIzaSyAxbjh1blqMTOdUOxNwmiFXv36cNwm4n6M")
# Shared YouTube Data API v3 client used by every function in this notebook.
youtube = build('youtube', 'v3', developerKey=gkey)

In [3]:
#Get up to 50 comments on a video plus all replies to these
def video_comments(video_id):
    """Collect comment threads (top-level comment plus its replies) for a video.

    Requests ``commentThreads.list`` page by page, following
    ``nextPageToken``, and stops once there are no further pages or more than
    50 threads have been collected (so the result can slightly exceed 50).

    Args:
        video_id: Id of the YouTube video whose comments are wanted.

    Returns:
        A list of ``[comment_text, [reply_text, ...]]`` entries, one per
        thread, or the string ``"disabled"`` when the API answers HTTP 403.

    Raises:
        googleapiclient.errors.HttpError, urllib.error.HTTPError: any HTTP
            failure other than a 403 is re-raised unchanged.
    """
    # The API client reports failures with its own HttpError class rather than
    # urllib's HTTPError, so it has to be caught explicitly.
    from googleapiclient.errors import HttpError

    commentlist = []
    page_token = None
    try:
        while True:
            params = {'part': 'snippet,replies', 'videoId': video_id}
            if page_token:
                # Without the token the API would hand back the first page
                # again and every comment would be stored repeatedly.
                params['pageToken'] = page_token
            video_response = youtube.commentThreads().list(**params).execute()

            for item in video_response.get('items', []):
                comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
                # Tolerate a thread without a 'replies' entry instead of
                # failing the whole video with a KeyError.
                replies = [
                    reply['snippet']['textDisplay']
                    for reply in item.get('replies', {}).get('comments', [])
                ]
                commentlist.append([comment, replies])

            page_token = video_response.get('nextPageToken')
            if not page_token or len(commentlist) > 50:
                return commentlist
    except HttpError as err:
        # NOTE(review): the API documents 403 for disabled comments, but it is
        # also used for other refusals (e.g. quota); all of them are coded as
        # "disabled" here, as before -- confirm that is acceptable.
        if err.resp.status == 403:
            return "disabled"
        raise
    except HTTPError as err:
        # Kept for backward compatibility with the previous handler.
        if err.code == 403:
            return "disabled"
        raise



In [4]:
#Get a dictionary of variables from one video

def getvars(id,is_spam):
    """Build the record stored for one video: label, metadata, stats, comments.

    Args:
        id: YouTube video id (the name shadows the builtin but is kept so
            existing callers are unaffected).
        is_spam: Label stored under ``"spam"``; must be 0, 1 or 2.

    Returns:
        A dict holding ``"id"``, ``"spam"``, whichever of the statistics and
        snippet fields listed below the API returned, and ``"commentSection"``
        (the result of ``video_comments``).

    Raises:
        ValueError: ``is_spam`` is not 0, 1 or 2. Raised before any API call
            so an invalid label does not cost a request.
        IndexError: the API returned no item for ``id``.
    """
    if is_spam not in (0, 1, 2):
        raise ValueError("Second argument must be 0, 1, or 2 or (0 for OK, 1 for money-scam,2 for grey area (to be used to tweak model later).")

    response = youtube.videos().list(
        id=id,
        part=["snippet", "statistics"],
    ).execute()

    items = response["items"]
    if not items:
        # Same exception type as indexing the empty list, but with a message
        # that says which id could not be resolved.
        raise IndexError("No video returned for id {!r}".format(id))

    snipvalues = items[0]["snippet"]
    statsvalues = items[0]["statistics"]

    statsvarlist = ["commentCount", "dislikeCount", "favoriteCount", "likeCount", "viewCount"]
    snipvarlist = ["defaultAudioLanguage", "description", "tags", "title", "thumbnails"]

    # int() keeps the stored label a plain 0/1/2 even if a bool was passed.
    record = {"id": id, "spam": int(is_spam)}

    # Fields missing from the response are simply left out of the record.
    record.update({key: statsvalues[key] for key in statsvarlist if key in statsvalues})
    record.update({key: snipvalues[key] for key in snipvarlist if key in snipvalues})

    record["commentSection"] = video_comments(id)
    return record



In [15]:
#Get the ids of a username's uploaded videos (paging stops once more than 200 are collected)
def get_ids_byuser(username):
    """Return video ids from the uploads playlist of the channel for a username.

    Args:
        username: Channel username, passed to ``channels.list`` as
            ``forUsername``.

    Returns:
        A list of video id strings. Pages are fetched until there is no
        ``nextPageToken`` or more than 200 ids have been collected.

    Raises:
        KeyError, IndexError: the channel lookup returned no usable item.
    """
    # The channel's contentDetails name the playlist that holds its uploads.
    channel_response = youtube.channels().list(
        forUsername=username,
        part=["contentDetails"],
    ).execute()
    uploads_id = channel_response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

    vidIdList = []
    page_token = None
    while True:
        params = {"playlistId": uploads_id, "part": ["contentDetails"]}
        if page_token:
            params["pageToken"] = page_token
        # Each page has to be executed; merely building the next request
        # would leave the previous page in place and re-append its ids.
        response = youtube.playlistItems().list(**params).execute()

        vidIdList.extend(
            entry["contentDetails"]["videoId"] for entry in response["items"]
        )

        page_token = response.get("nextPageToken")
        if not page_token or len(vidIdList) > 200:
            return vidIdList
  

    


In [16]:
#Get the ids of a channel's uploaded videos by channel id (paging stops once more than 300 are collected)
def get_ids_bychannelid(ID):
    """Return video ids from the uploads playlist of the channel with this id.

    Args:
        ID: Channel id, passed to ``channels.list`` as ``id``.

    Returns:
        A list of video id strings. Pages are fetched until there is no
        ``nextPageToken`` or more than 300 ids have been collected.

    Raises:
        KeyError, IndexError: the channel lookup returned no usable item.
    """
    # The channel's contentDetails name the playlist that holds its uploads.
    channel_response = youtube.channels().list(
        id=ID,
        part=["contentDetails"],
    ).execute()
    uploads_id = channel_response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

    vidIdList = []
    page_token = None
    while True:
        params = {"playlistId": uploads_id, "part": ["contentDetails"]}
        if page_token:
            params["pageToken"] = page_token
        # Each page has to be executed; merely building the next request
        # would leave the previous page in place and re-append its ids.
        response = youtube.playlistItems().list(**params).execute()

        vidIdList.extend(
            entry["contentDetails"]["videoId"] for entry in response["items"]
        )

        page_token = response.get("nextPageToken")
        if not page_token or len(vidIdList) > 300:
            return vidIdList
  

    


In [7]:
#connect to mongodb

# Connection string for the MongoDB server on this machine (default port).
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Database "videos_mdb" and its "videos" collection, where the sampling
# functions insert the per-video records.
db = client["videos_mdb"]
collection = db.videos

In [8]:
#Define a function to check if we already downloaded a video yet
def is_inthedb(id):
    """Return True when a record with this video id is already stored.

    Args:
        id: Video id to look for in the ``"id"`` field of ``collection``.

    Returns:
        bool: True if at least one matching document exists, else False.
    """
    # Only existence matters, so let the server stop at the first match.
    return collection.count_documents({"id": id}, limit=1) > 0

In [9]:
#Define a function to add video info to MongoDB by channel id
def sample_vids_bychannelid(id, is_scam):
    """Store a record for every not-yet-stored upload of a channel.

    Args:
        id: Channel id handed to ``get_ids_bychannelid``.
        is_scam: Label forwarded to ``getvars`` for every video (0, 1 or 2).

    Returns:
        None. Records are inserted into the ``videos`` collection of the
        ``videos_mdb`` database.
    """
    # Insert through the shared client directly. Wrapping each insert in
    # `with client:` closes the MongoClient on exit, while the remaining
    # iterations (and later cells) still need it.
    videos = client.videos_mdb.videos
    for video_id in get_ids_bychannelid(id):
        if is_inthedb(video_id):
            continue
        videos.insert_one(getvars(video_id, is_scam))


In [10]:
#Define a function to add video info to MongoDB by username
def sample_vids_byusername(username, is_scam):
    """Store a record for every not-yet-stored upload of a username's channel.

    Args:
        username: Channel username handed to ``get_ids_byuser``.
        is_scam: Label forwarded to ``getvars`` for every video (0, 1 or 2).

    Returns:
        None. Records are inserted into the ``videos`` collection of the
        ``videos_mdb`` database.
    """
    # Insert through the shared client directly. Wrapping each insert in
    # `with client:` closes the MongoClient on exit, while the remaining
    # iterations (and later cells) still need it.
    videos = client.videos_mdb.videos
    for video_id in get_ids_byuser(username):
        if is_inthedb(video_id):
            continue
        # getvars looks up a video, so it must receive the video id being
        # iterated, not the channel username.
        videos.insert_one(getvars(video_id, is_scam))

In [11]:
#Function to fetch individual spam videos 
def getone(id, is_spam):
    """Fetch and store a single video unless it is already in the database.

    Args:
        id: Video id to fetch.
        is_spam: Label forwarded to ``getvars``.

    Prints "Already got it." and does nothing else when the id is stored.
    """
    if not is_inthedb(id):
        record = getvars(id, is_spam)
        client.videos_mdb.videos.insert_one(record)
        return
    print("Already got it.")

In [28]:
#Manually search for good channels and videos for my data
# Ask the search endpoint for the first 5 results matching the query. The
# channelId shown in the displayed result is what gets passed to
# sample_vids_bychannelid in the sampling cells.
request=youtube.search().list(
    q= "John Crestani",
    part=["id", "snippet"],
    maxResults=5

)
response=request.execute()
# Bare expression: displays the raw response dict as the cell output.
response

{'kind': 'youtube#searchListResponse',
 'etag': 'd6SnBqEL-7XYaoLoBgQDpGcmGd0',
 'nextPageToken': 'CAUQAA',
 'regionCode': 'US',
 'pageInfo': {'totalResults': 48967, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#searchResult',
   'etag': 'XZFlF1P9JQ_OHvJtolU6SGADFRU',
   'id': {'kind': 'youtube#channel', 'channelId': 'UCP1WWozNcnnBX73ZoMSem5w'},
   'snippet': {'publishedAt': '2009-06-08T20:04:20Z',
    'channelId': 'UCP1WWozNcnnBX73ZoMSem5w',
    'title': 'John Crestani',
    'description': 'Growing up I was always a voracious learner with a penchant for travel. When I was in college I grew frustrated with the lack of real-world applications for the ...',
    'thumbnails': {'default': {'url': 'https://yt3.ggpht.com/ytc/AKedOLTmnFDPsroiPlWZ9kQOiNuoA0QlfWURjP1K1ZAfUA=s88-c-k-c0xffffffff-no-rj-mo'},
     'medium': {'url': 'https://yt3.ggpht.com/ytc/AKedOLTmnFDPsroiPlWZ9kQOiNuoA0QlfWURjP1K1ZAfUA=s240-c-k-c0xffffffff-no-rj-mo'},
     'high': {'url': 'https://yt3.ggpht.com/ytc/AKedOLTmnF

In [16]:
#Sampling non-scams 
#sample_vids_bychannelid("UCm22FAXZMw1BaWeFszZxUKw",0)  #Kitboga - complete
#sample_vids_byusername("SmoshGames",0)
#sample_vids_bychannelid("UCY30JRSgfhYXA6i6xX1erWg",0)  #Markiplier - complete
#sample_vids_bychannelid("UCrPUg54jUy1T_wII9jgdRbg", 0) #Chris Ramsay
#sample_vids_bychannelid("UCnZj2VMd3IdyzIKOJCK4VlA",0)
#---Non-scam videos that have scam-like qualities such as relevant keywords or legitimate product advertisements (so the machine can make finer distinctions)
#
#sample_vids_bychannelid("UCm22FAXZMw1BaWeFszZxUKw",0) #--Kitboga
#sample_vids_bychannelid("UCTCpOFIu6dHgOjNJ0rTymkQ",0) # as seen on TV -- complete
#sample_vids("Jim Browning",0)
#sample_vids("Freakin' Reviews",0)
#sample_vids("Chadtronic",0)
#sample_vids_bychannelid("UCZLFu8bHbwtnIgWLg5UtINw",0)#VidIQ. This one is important, because it shows you how to get subscribers the right way (rather than selling giftcards or subs for subs). --complete#

#---Normal non-scam videos
#sample_vids("Insym",0)
#sample_vids_bychannelid("UCnmgSO_4g6QcRzy0yFeglyA",0)#Grand Illusions --complete
#sample_vids_bychannelid("UC1VLQPn9cYSqx8plbk9RxxQ",0)# The action lab --complete
#sample_vids("Chris Ramsay",0)
#goodlist=["tYQY1UKDLFM","PWQDccL0aXM","eoxJWJaA1gc","PCimuf6F6C8","7rfVE2JvLqA"]
#moregoods=["sJqyaSV6E7c"]
#for item in goodlist:
#    try:
#        getone(item,0)
#    except:
#        continue


# Video ids set aside under this label; no code visible in this notebook
# reads the list yet. NOTE(review): presumably parody/educational videos
# about scams, going by the name -- confirm before sampling them.
parodies_and_education = ["gWDAU4vTM7Y","Uenf1EHZRrQ"]




In [25]:
#Sampling scams


#sample_vids_bychannelid("UCAT3-AQKNU0ITQXnjLOoDWA",1) #Wesley Virgin - complete
#sample_vids_bychannelid("UCBnYn54boCxNoob5DXsy_ag",1) #Digital Millionaire -complete
#sample_vids_bychannelid("UClVGRVvggdqZT02kjiVt0IQ",1) #Dave Nick --complete
#sample_vids_bychannelid("UCC2Sqxq54NVM87b-gV1mKag",1)  #finance girl

#getone("KH-i2P92bS4",1)
#-----get all from "finance girl"
#spamlist=["jK7xrgOeOQg","feAfP_MWz5g","gnNyEBdBkO4","H-i2P92bS4","6RLYyE3dDLE","mAg7Qs-XifE","1DL1xnmkbJM","QDfXqGn4Bmo&t=138s","UPkEZ0Rl11k&t=169s","h6sbddOaI88","Q7O5aKKm4uM","B75deAvCw9o","Y5osifxyCSU&t=55s", "97Z37QRz2a4", "YgEKUE9vwPk","QuS0HqXx9sI","K-n06-1eS2A","Gs9saVFwyno","QpuVeL9IKCk","6ixsQInp11U","i-Lg4efoOJY","PvOi6uxtLIc","WsZllvBTNvA","cpEA1050d_s","qHWfP56Xo2E","ixpI0jBM_ps","z-6Ol9Gu2Bw","5C6GrJzc5zM","aj5WXrUfb0U","Gs9saVFwyno","SUFrcNpFYaw","lGTZf4AuaRY","ayXCcOOFWCk","fyzUXGmccKI","T7aPZo09ToM","gU_D_SkxpOY","XNE0jrfZs28","qvG7TDnzuf0","clzqH8jXlhY","SV_4nhfKKYo","apXcI7QnTzk","ioT5aRzPWFY","p5TUdv4G1K0","M9YKeor__8A&t=8s","dnsVDw5NqPs","Jko7MDeAzzo","umbpntB65uc","82hRKeuZav0","1XgVTx4j4c0"]

#sample_vids_bychannelid("UCP1WWozNcnnBX73ZoMSem5w",1) #John Crestani
#sample_vids_bychannelid("UCR54Ec2mGPnJgq72ZNQTtWw",1) #michael cove
#sample_vids_bychannelid("UCCH0gM_-7RYWsjONdgohEkg",1) #cashina
#sample_vids_bychannelid("UCEjCdYwNLBKAq0hSUpxa1lg",1) #Jaffry Ward
#sample_vids_bychannelid("UCYONTR6PoZVJ4EcbOz3qdtw",1)
#sample_vids_bychannelid("UCSJ3rGr-JC7OwB1YWlNfoEA",1)
#sample_vids_bychannelid("UC0kC_8hca4_aECYCM4_4Shw",1)
#sample_vids_bychannelid("UC22qq_6d3k59dHkXQR8-Peg",1)
#sample_vids_bychannelid("UCjxzZR5H3_WiIQgLMNuX0FA",1)
#sample_vids_bychannelid("UCxeR0XmPyYhKp4DuDCONPZw",1)
#sample_vids_bychannelid("UCnTVnVxl2KKTLp-vf3PEi6g",1)
# The only call left active in this cell: store this channel's uploads with
# label 1 (money-scam, per the label meanings in getvars' error message).
sample_vids_bychannelid("UCzyDzU9WgkMhdSnHHu6Fqjg",1)

In [None]:
#Manually search for the best usernames' channelID
# Same search call as the earlier manual-search cell with a different query;
# the result is kept in `response` and not displayed here.
request=youtube.search().list(
    q= "Zork",
    part=["id", "snippet"],
    maxResults=5

)
response=request.execute()


In [None]:
#TO DO

#replace comment-count missing data with 0, but add new dichotomous variable for disabled comments
#recode default audio language to be numeric
#delete repeat comments. I think there was an error collecting them at one point
#document the fact that a missing default audio language will not be recorded as missing data, but as its own category. This is because a video missing a default language might be a relevant predictor of scam-status
#be POSITIVE that video and channel id do not end up as variables
#add vars likecount/viewcount, dislikecount/viewcount
#eliminate the "favoritecount" variable. Its value is either always 0, or 0 too often, and will create skew.
#drop duplicate records
#handle coding of comment section disabled --- fixed in retrieval, coded as "disabled", but may need more handling

In [None]:
#step 0: collect
#step 1: have one or more nested machine learning algorithms predict the spam dummy based on qualitative vars like title, tags, and comments
#step 2: have a parent algorithm (or regression equation) predict spammyness based on other values + output of nested predictions as factors

In [40]:
  # Fetch the channel-level statistics for one channel id (the channelId that
  # appears in the "John Crestani" search output earlier in the notebook).
  request=youtube.channels().list(
      id = "UCP1WWozNcnnBX73ZoMSem5w",
      part=["statistics"]
    
  )
  response=request.execute()



In [41]:
# Bare expression: displays the channel statistics response as the cell output.
response

{'kind': 'youtube#channelListResponse',
 'etag': 'sT6KH257MODRFv_jqgOpPylaEXg',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#channel',
   'etag': '2YloBpNLteR62J5r6AO8X3W1ZYg',
   'id': 'UCP1WWozNcnnBX73ZoMSem5w',
   'statistics': {'viewCount': '24492920',
    'subscriberCount': '504000',
    'hiddenSubscriberCount': False,
    'videoCount': '495'}}]}

In [34]:
#scams
#scamlist2=["7yYvCIUjx7o","_vcBDMq6PkM","784LEikg8_o","J90cJfKlhEY","EkLFsL4KevU"]
#for item in scamlist2:
#    try:
#        getone(item,1)
#    except:
#        print("skip")


#getone("Xb3zPtkw8IQ",1)
#getone("zCXK4H0Vits",1)
#getone("nKo8SeFtd68",1)
# The only call left active in this cell: store this single video with
# label 1 (money-scam, per the label meanings in getvars' error message).
getone("9iwvdsU59Fo",1)

In [16]:
#non-scam giveaways that follow YT's rules decently well
#goodlist2=["_ltiL-AyRAk","3gh1AdtQKWQ"]

#for item in goodlist2:
#    try:
#        getone(item,0)
#    except:
#        print("skip")

In [17]:
#Well-meaning people (I think) not following the giveaway rules 

# Plain video ids only -- no URL parameters such as "&t=94s", which are not
# part of the id and cannot be resolved by the videos lookup.
greylist=["BOByZjhFRmw","cEBNadvCJbs","KfArrqQtao0","iJj_Ikx_uDY","aAtJ1zC2LPk","005ON0SKk9Q","m6gLe45zSsw","-dBwZdc_c0M","Co0_HVab0vw","VBozk2qZEpg","PC54M2M4hB0", "eqyDoWveDNg"]

# Store each video with label 2 (grey area). Best effort: one failing video
# must not stop the rest, but report which id failed and why instead of
# silently swallowing everything (including KeyboardInterrupt).
for item in greylist:
    try:
        getone(item,2)
    except Exception as err:
        print("skip", item, err)

skip


In [None]:
#doesn't follow giveaway rules and is also kinda sus but not obviously horrible
# Kept as a list of id strings: a bare id is not valid data in a code cell
# (e.g. YHn1xTy-uY0 parses as a subtraction of two undefined names).
suslist = [
    "YHn1xTy-uY0",
    "Cv9yNX9JkAc",
    "ceb2MQyXzwc",
    "sX1xZCwE5WY",
    "QKoCP0x5h_M",
    "sD0cLKzzHRI",
]