# Data Preparation for EDA and Modeling

In [1]:
# Imports
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as ticker
import matplotlib.image as mpimg
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string
import re
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import CountVectorizer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import warnings
warnings.filterwarnings("ignore")

## Defining Processing Functions

In [2]:
def game_cleaner(title):
    cleaned_game = title.lower()
    cleaned_game = cleaned_game.replace("starcraft2co-op","starcraft2")
    cleaned_game = cleaned_game.replace("starcraft2brutal+6","starcraft2")
    cleaned_game = cleaned_game.replace("warcraft3reforged","warcraft3")
    cleaned_game = cleaned_game.replace("lowkovszelda","zelda")
    cleaned_game = cleaned_game.replace("lowkoplays...starcraft","starcraft")
    cleaned_game = cleaned_game.replace("dotaunderlords","underlords")
    cleaned_game = cleaned_game.replace("worldofwarcraftrp","worldofwarcraft")
    cleaned_game = cleaned_game.replace("lowkoplaysworldofwarcraft","worldofwarcraft")
    cleaned_game = cleaned_game.replace("lowkovsabzu","sabzu")
    cleaned_game = cleaned_game.replace("lowkovsaplaguetale","aplaguetale")
    cleaned_game = cleaned_game.replace("lowkovssekiro","sekiro")
    cleaned_game = cleaned_game.replace("hearthstonebattlegrounds","hearthstone")
    return cleaned_game

In [3]:
# Preprocessor to help tokenize data
def splitter(input_message):
    message = input_message
    container = []
    # These stop words don't add much to the analysis.  Kept other common stop words
    special_stops = ['of','OF','is','the','THE','by','BY','it','in','on','and','but','being','an',
                     'for','to','they','any','from','then','some','you','your','their','as','about',
                     'out','with','his','hers','he','she','at','go','be']
    
    # Identifies special numbers that we want to keep from the data
    # Usually involves computer parts (memory, GPU), numbers found in video game titles, or matchups (1 vs 1, 2 vs 2)
    comp_numbers = [2077,32,64,128,256,512,1024,1080,3080,60,144,2018,2019,2020,2021,2022,100,200,1,2,3,4]
    
    # specific player cleanup
    message = message.replace("dongraegu","drg")
    message = message.replace("dong rae gu",'drg')
    message = message.replace("rattata",'vanya')
    message = message.replace("liquid","")
    message = message.replace("dark templar","darktemplar")
    message = message.replace("dark shrine","darkshrine")

    # Obtains the first word before the column which is usually a video game title
    splitted = message.split(":")
    index = 0
    if ":" in message:
        container.append(game_cleaner("".join(message.split(":")[0].split())))
        index = 1
    
    # Starts overall split of the data
    for word in message.split():
        if (index == 1) | (":" in word):
            index = 2
            continue
        # Removes punctuations and special characters
        all_upper = word.strip("!?()#&*,")
        # Removes possessives
        all_upper = all_upper.replace("'s","").replace("'d","").replace("'","")

        # Removes numbers not in our accepted numbers list
        if all_upper.isdigit():
            if (int(all_upper) in comp_numbers):
                pass
            else:
                continue
                


        # Keeping words that are fully capitalized, otherwise, set word to lowercase
        if all_upper == all_upper.upper():
            container.append(all_upper)
        else:
            # Catches stop words
            if all_upper.lower() in special_stops:
                continue
            else:
                container.append(all_upper.lower())
    joined = " ".join(container)
    return joined

In [4]:
# Pulls out special words from predefined list
def get_from_list(message, special_list):
    container = []
    splitted = splitter(message)
    for word in splitted.split():
        if word.lower() in special_list:
            container.append(word.lower())
    return container

In [5]:
# Pulls out specific race or country info for players
def get_player_info(message,special_list,special_df, info=None):
    info_df = special_df.copy()
    info_df['tag'] = info_df['tag'].apply(lambda x: x.lower())
    container = []
    players = []
    splitted = splitter(message)
    if (info == 'race') | (info == 'country'):
        for word in splitted.split():
            if word.lower() in special_list:
                row = special_df.loc[info_df['tag'] == word.lower()]
                value = row[info].values[0]
                if word.lower() not in players:
                    container.append(value)
                    players.append(word.lower())
    # handles special cases
    if len(container)==3:
        container.pop(0)
    return container

In [6]:
# Extracts desired information from video stats
# Had to remove dislike count as they were removed from YouTube recently
def get_stats(video):
    vid_id = video['id']
    title = video['snippet']['title']
    date = video['snippet']['publishedAt']
    duration = video['contentDetails']['duration'] #example PT1H1M31S
    thumbnail = video['snippet']['thumbnails']['medium']['url']
    tags = video['snippet']['tags']
    views = video['statistics']['viewCount']
    likes = video['statistics']['likeCount']
    return {'vid_id':vid_id,'title':title,'date':date,'duration':duration,
            'thumbnail':thumbnail,'tags':tags,'views':views,'likes':likes}

In [7]:
# Convert duration to minutes
def dur_to_min(time):
    
    # Using regex to parse time
    nph = '(?=\d{1,2}H)(\d{1,2})'
    npm = '(?=\d{1,2}M)(\d{1,2})'
    nps = '(?=\d{1,2}S)(\d{1,2})'
    ph = re.compile(nph)
    pm = re.compile(npm)
    ps = re.compile(nps)
    h = ph.findall(time)
    m = pm.findall(time)
    s = ps.findall(time)
    
    # Converts to float
    if h:
        h = float(h[0])
    else:
        h = 0.0
    if m:
        m = float(m[0])
    else:
        m = 0.0
    if s:
        s = float(s[0])
    else:
        s = 0.0
        
    # Calculates total minutes
    minutes = (h*60)+m+(s/60.0)
    return round(minutes,2)

## Importing data pulled from APIs

In [8]:
all_stats = pickle.load(open(r"Data/all_stats.p","rb"))
players = pickle.load(open(r"Data/players_df.pickle","rb"))

In [9]:
# converting all_stats info a dataframe
main_df = pd.DataFrame.from_dict(all_stats).apply(lambda x: get_stats(x), axis=1).apply(pd.Series)

In [10]:
main_df['views'] = main_df['views'].astype(int)
main_df['likes'] = main_df['likes'].astype(int)
main_df['minutes'] = main_df['duration'].apply(lambda x: dur_to_min(x))

In [11]:
main_df.head()

Unnamed: 0,vid_id,title,date,duration,thumbnail,tags,views,likes,minutes
0,LHDQ8iSbdck,StarCraft 2: EPIC COMEBACKS! (Scarlett vs ByuN),2021-12-21T19:00:14Z,PT51M,https://i.ytimg.com/vi/LHDQ8iSbdck/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",8088,500,51.0
1,IM7OdVKFW4M,StarCraft 2: Maru LOVES CHEESING Cure! (Best-o...,2021-12-19T18:54:54Z,PT45M38S,https://i.ytimg.com/vi/IM7OdVKFW4M/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",57301,1502,45.63
2,ReDXHIf6cuc,StarCraft 2: Can MaxPax OVERTAKE ShoWTimE's To...,2021-12-18T19:00:18Z,PT21M41S,https://i.ytimg.com/vi/ReDXHIf6cuc/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",43247,1400,21.68
3,kWfuYGNq1C4,StarCraft 2: Dark's Epic NEW Micro Trick! (Byu...,2021-12-17T19:00:30Z,PT31M4S,https://i.ytimg.com/vi/kWfuYGNq1C4/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",71067,2080,31.07
4,IUqQ_rVnSXg,StarCraft 2: MASSING ORACLES? (Lambo vs herO),2021-12-16T19:00:05Z,PT35M6S,https://i.ytimg.com/vi/IUqQ_rVnSXg/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",69353,1839,35.1


In [12]:
# Only keeping the last 3 years of YouTube video data
current_df = main_df.loc[main_df['date']>='2018']

In [13]:
current_df

Unnamed: 0,vid_id,title,date,duration,thumbnail,tags,views,likes,minutes
0,LHDQ8iSbdck,StarCraft 2: EPIC COMEBACKS! (Scarlett vs ByuN),2021-12-21T19:00:14Z,PT51M,https://i.ytimg.com/vi/LHDQ8iSbdck/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",8088,500,51.00
1,IM7OdVKFW4M,StarCraft 2: Maru LOVES CHEESING Cure! (Best-o...,2021-12-19T18:54:54Z,PT45M38S,https://i.ytimg.com/vi/IM7OdVKFW4M/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",57301,1502,45.63
2,ReDXHIf6cuc,StarCraft 2: Can MaxPax OVERTAKE ShoWTimE's To...,2021-12-18T19:00:18Z,PT21M41S,https://i.ytimg.com/vi/ReDXHIf6cuc/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",43247,1400,21.68
3,kWfuYGNq1C4,StarCraft 2: Dark's Epic NEW Micro Trick! (Byu...,2021-12-17T19:00:30Z,PT31M4S,https://i.ytimg.com/vi/kWfuYGNq1C4/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",71067,2080,31.07
4,IUqQ_rVnSXg,StarCraft 2: MASSING ORACLES? (Lambo vs herO),2021-12-16T19:00:05Z,PT35M6S,https://i.ytimg.com/vi/IUqQ_rVnSXg/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",69353,1839,35.10
...,...,...,...,...,...,...,...,...,...
1247,2KJv_9IbrZ0,StarCraft 2: THE PLANETARY FORTRESS CONTAIN!,2018-01-06T21:00:00Z,PT22M19S,https://i.ytimg.com/vi/2KJv_9IbrZ0/mqdefault.jpg,"[lowko, lowkotv, tutorial, guide, commentary, ...",228100,4394,22.32
1248,aOqlr2tN4Eo,StarCraft 2: ESCAPE ON THE HYPERION!,2018-01-05T21:00:00Z,PT32M4S,https://i.ytimg.com/vi/aOqlr2tN4Eo/mqdefault.jpg,"[StarCraft 2, StarCraft 2: Wings of Liberty, W...",165132,4039,32.07
1249,Age4eNRUMrk,StarCraft 2: THE ZERG... MOTHERSHIP?!,2018-01-04T21:00:01Z,PT28M30S,https://i.ytimg.com/vi/Age4eNRUMrk/mqdefault.jpg,"[lowko, lowkotv, tutorial, guide, commentary, ...",1783323,32859,28.50
1250,czuUoHltWpc,StarCraft 2: MASS SKY ZERG! (Triple Threat),2018-01-03T21:00:01Z,PT20M34S,https://i.ytimg.com/vi/czuUoHltWpc/mqdefault.jpg,"[starcraft 2, starcraft 2 co-op, co-op mission...",60053,1143,20.57


## Cleaning the Player Data

In [14]:
# Converting string data to lowercase
players['country'] = players['country'].apply(lambda x: str(x).lower())
players['race'] = players['race'].apply(lambda x: str(x).lower())
players['tag'] = players['tag'].apply(lambda x: str(x).lower())

In [15]:
# Removing player names too similar to other commonly used words in titles
remove = ['jim','punk','thor','hyperion','probe','control','alpha','fenix','golden','fear','flood','strange',
          'terran','zerg','scv','nexus','reaper','meat','blink','chance','mechanics','wave','next','nice','zero',
          'shadow','raise','job','doctor','time','has','phoenix','raise','sortof','dns','keen',
          'cham','prototype','academy','ranger','blacksmith','faith','eternity','chase','crimson',
          'albion','fate','tears','coffee','monster','ready','hunter','ling','turn','master','risky']

In [16]:
dropped_players = players.loc[~players['tag'].isin(remove)]

In [17]:
dropped_players.head()

Unnamed: 0,country,id,race,resource_uri,tag
0,fi,485,z,/api/v1/player/485/,serral
1,kr,49,t,/api/v1/player/49/,maru
2,fr,5878,t,/api/v1/player/5878/,clem
3,it,5414,z,/api/v1/player/5414/,reynor
4,kr,76,z,/api/v1/player/76/,dark


In [18]:
player_cleaning_df = current_df.copy()

In [19]:
player_cleaning_df

Unnamed: 0,vid_id,title,date,duration,thumbnail,tags,views,likes,minutes
0,LHDQ8iSbdck,StarCraft 2: EPIC COMEBACKS! (Scarlett vs ByuN),2021-12-21T19:00:14Z,PT51M,https://i.ytimg.com/vi/LHDQ8iSbdck/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",8088,500,51.00
1,IM7OdVKFW4M,StarCraft 2: Maru LOVES CHEESING Cure! (Best-o...,2021-12-19T18:54:54Z,PT45M38S,https://i.ytimg.com/vi/IM7OdVKFW4M/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",57301,1502,45.63
2,ReDXHIf6cuc,StarCraft 2: Can MaxPax OVERTAKE ShoWTimE's To...,2021-12-18T19:00:18Z,PT21M41S,https://i.ytimg.com/vi/ReDXHIf6cuc/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",43247,1400,21.68
3,kWfuYGNq1C4,StarCraft 2: Dark's Epic NEW Micro Trick! (Byu...,2021-12-17T19:00:30Z,PT31M4S,https://i.ytimg.com/vi/kWfuYGNq1C4/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",71067,2080,31.07
4,IUqQ_rVnSXg,StarCraft 2: MASSING ORACLES? (Lambo vs herO),2021-12-16T19:00:05Z,PT35M6S,https://i.ytimg.com/vi/IUqQ_rVnSXg/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",69353,1839,35.10
...,...,...,...,...,...,...,...,...,...
1247,2KJv_9IbrZ0,StarCraft 2: THE PLANETARY FORTRESS CONTAIN!,2018-01-06T21:00:00Z,PT22M19S,https://i.ytimg.com/vi/2KJv_9IbrZ0/mqdefault.jpg,"[lowko, lowkotv, tutorial, guide, commentary, ...",228100,4394,22.32
1248,aOqlr2tN4Eo,StarCraft 2: ESCAPE ON THE HYPERION!,2018-01-05T21:00:00Z,PT32M4S,https://i.ytimg.com/vi/aOqlr2tN4Eo/mqdefault.jpg,"[StarCraft 2, StarCraft 2: Wings of Liberty, W...",165132,4039,32.07
1249,Age4eNRUMrk,StarCraft 2: THE ZERG... MOTHERSHIP?!,2018-01-04T21:00:01Z,PT28M30S,https://i.ytimg.com/vi/Age4eNRUMrk/mqdefault.jpg,"[lowko, lowkotv, tutorial, guide, commentary, ...",1783323,32859,28.50
1250,czuUoHltWpc,StarCraft 2: MASS SKY ZERG! (Triple Threat),2018-01-03T21:00:01Z,PT20M34S,https://i.ytimg.com/vi/czuUoHltWpc/mqdefault.jpg,"[starcraft 2, starcraft 2 co-op, co-op mission...",60053,1143,20.57


### Adding labels for players in title

In [20]:
player_list = dropped_players.tag.values

In [21]:
# Creates a column with a list of players found in title
player_cleaning_df['players'] = player_cleaning_df['title'].apply(lambda x: get_from_list(x,player_list))

In [22]:
# Creates a column for each player and add 1 if player is in title
for i in range(len(player_list)):
    for player in player_cleaning_df.iloc[i].players:
        player_cleaning_df.at[i,player]=1

### Finding average views and number of videos for each player

In [23]:
avg_views = []
count_views = []

# Find average views and number of videos, 0 if none
for player in player_list:
    try:
        avg_views.append(player_cleaning_df.loc[player_cleaning_df[player]==True].views.mean())
        count_views.append(player_cleaning_df.loc[player_cleaning_df[player]==True].views.count())
    except:
        avg_views.append(0)
        count_views.append(0)

In [24]:
# Add these values to the player info data frame
dropped_players['avg_views'] = avg_views
dropped_players['num_videos'] = count_views

In [25]:
dropped_players.head()

Unnamed: 0,country,id,race,resource_uri,tag,avg_views,num_videos
0,fi,485,z,/api/v1/player/485/,serral,157787.45283,53
1,kr,49,t,/api/v1/player/49/,maru,146143.527778,36
2,fr,5878,t,/api/v1/player/5878/,clem,142728.652174,46
3,it,5414,z,/api/v1/player/5414/,reynor,140566.318182,44
4,kr,76,z,/api/v1/player/76/,dark,129995.939394,33


## Getting Country Data

In [26]:
countries_list = np.array(list(set(dropped_players.country)))

In [27]:
countries_list[:5]

array(['ve', 'pe', 'jp', 'hu', 'ee'], dtype='<U4')

In [28]:
countries_list.astype(str)

array(['ve', 'pe', 'jp', 'hu', 'ee', 'ph', 'lu', 'fi', 'eg', 'at', 'mx',
       'hr', 'dk', 'us', 'cr', 'si', 'uz', 'hk', 'lv', 'gt', 'tr', 'fr',
       'co', 'no', 'cn', 'ro', 'cz', 'by', 'none', 'id', 'se', 'il', 'lt',
       'za', 'ie', 'bo', 'it', 'dz', 'pl', 'be', 'de', 'is', 'cl', 'mm',
       'tw', 'bd', 'ua', 'ba', 'ru', 'au', 'uk', 'sk', 'ar', 'kr', 'ca',
       'cu', 'ch', 'vn', 'in', 'nl', 'br', 'es', 'my', 'bg', 'sg', 'nz'],
      dtype='<U4')

In [29]:
# Adds list of countries associated with the player names found in titles
player_cleaning_df['country'] = player_cleaning_df['title'].apply(
                                lambda x: get_player_info(x,player_list,dropped_players,info='country'))

In [30]:
for i in range(len(player_cleaning_df)):
    for country in player_cleaning_df.iloc[i].country:
        player_cleaning_df.at[i,country]=1
player_cleaning_df.fillna(0,inplace=True)

In [31]:
avg_views_country = []
count_views_country = []

# Find average views and number of videos for countries
for country in countries_list:
    try:
        avg_views_country.append(player_cleaning_df.loc[player_cleaning_df[country]==1].views.mean())
        count_views_country.append(player_cleaning_df.loc[player_cleaning_df[country]==1].views.count())
    except:
        avg_views_country.append(0)
        count_views_country.append(0)

In [32]:
countries_df = pd.DataFrame(data=[countries_list])

In [33]:
countries_df = countries_df.T

In [34]:
countries_df['avg_views'] = avg_views_country
countries_df['num_videos'] = count_views_country

In [35]:
countries_df.columns = ['country_code','avg_views','num_videos']

In [36]:
countries_df

Unnamed: 0,country_code,avg_views,num_videos
0,ve,0.0,0
1,pe,237502.0,1
2,jp,0.0,0
3,hu,0.0,0
4,ee,0.0,0
...,...,...,...
61,es,76507.0,1
62,my,0.0,0
63,bg,0.0,0
64,sg,0.0,0


In [37]:
# imports csv with country codes and names
ccodes = pd.read_csv(r'Data\country_codes.csv',encoding='latin-1')
ccodes['Code'] = ccodes['Code'].apply(lambda x: str(x).lower().strip())

In [38]:
country_zip = countries_df['country_code']

In [39]:
country_container = []
for country in country_zip:
    try:
        country_container.append(ccodes.loc[ccodes['Code']==country,'Country'].values[0])
    except:
        country_container.append("None")

In [40]:
countries_df['country'] = country_container

In [41]:
countries_df.head()

Unnamed: 0,country_code,avg_views,num_videos,country
0,ve,0.0,0,"Venezuela, Bolivarian Republic of"
1,pe,237502.0,1,Peru
2,jp,0.0,0,Japan
3,hu,0.0,0,Hungary
4,ee,0.0,0,Estonia


## Getting SC2 Race Data

In [42]:
# Following similar logic to the countries, except the totals will have number of races instead of just a label

In [43]:
# Starcraft 2 races = Zerg, Protoss, Terran
sc2races = ['z','p','t']

In [44]:
# Adds list of countries associated with the player names found in titles
player_cleaning_df['sc2race'] = player_cleaning_df['title'].apply(
                                lambda x: get_player_info(x,player_list,dropped_players,info='race'))
player_cleaning_df['z'] = 0
player_cleaning_df['t'] = 0
player_cleaning_df['p'] = 0

In [45]:
player_cleaning_df['sc2race']

0       [z, t]
1       [t, t]
2       [p, p]
3       [z, t]
4       [z, p]
         ...  
1247        []
1248        []
1249        []
1250        []
1251        []
Name: sc2race, Length: 1252, dtype: object

In [46]:
for i in range(len(player_cleaning_df)):
    for race in player_cleaning_df.iloc[i].sc2race:
        try:
            player_cleaning_df.at[i,race]+=1
        except:
            player_cleaning_df.at[i,race]=1

In [47]:
avg_views_race = []
count_views_race = []

# Find average views and number of videos for sc2 races
for race in sc2races:
    try:
        avg_views_race.append(player_cleaning_df.loc[player_cleaning_df[race]>0].views.mean())
        count_views_race.append(player_cleaning_df.loc[player_cleaning_df[race]>0].views.count())
    except:
        avg_views_race.append(0)
        count_views_race.append(0)

In [48]:
sc2races_df = pd.DataFrame(data=[sc2races])
sc2races_df = sc2races_df.T

In [49]:
sc2races_df['avg_views'] = avg_views_race
sc2races_df['num_videos'] = count_views_race
sc2races_df.columns = ['race','avg_views','num_videos']

In [50]:
sc2races_df

Unnamed: 0,race,avg_views,num_videos
0,z,133687.225941,239
1,p,119526.383333,180
2,t,128016.71134,194


## Getting Matchup Data
- Focusing on just the 1 vs 1 matchups for this EDA, which is the most popular in this channel

In [51]:
# 6 combinations of matchups for 3 races
sc2matchups = ['ZvZ','ZvP','ZvT','TvT','TvP','PvP']

In [52]:
for matchup in sc2matchups:
    player_cleaning_df[matchup] = 0

In [53]:
# Assigns a label for each combination of matchup
for i in range(len(player_cleaning_df)):
    race = player_cleaning_df.iloc[i].sc2race
    if len(race) == 2:
        if ("z" in race) & ("t" in race):
            player_cleaning_df.at[i,'ZvT']=1
        elif ("z" in race) & ("p" in race):
            player_cleaning_df.at[i,'ZvP']=1
        elif ("t" in race) & ("p" in race):
            player_cleaning_df.at[i,'TvP']=1
        elif ("z" in race):
            player_cleaning_df.at[i,'ZvZ']=1
        elif ("t" in race):
            player_cleaning_df.at[i,'TvT']=1
        elif ("p" in race):
            player_cleaning_df.at[i,'PvP']=1

In [54]:
matchup_df = pd.DataFrame(data=[sc2matchups])
matchup_df = matchup_df.T

In [55]:
avg_views_matchup = []
count_views_matchup = []

# Find average views and number of videos for sc2 races
for matchup in sc2matchups:
    try:
        avg_views_matchup.append(player_cleaning_df.loc[player_cleaning_df[matchup]>0].views.mean())
        count_views_matchup.append(player_cleaning_df.loc[player_cleaning_df[matchup]>0].views.count())
    except:
        avg_views_matchup.append(0)
        count_views_matchup.append(0)

In [56]:
matchup_df['avg_views'] = avg_views_matchup
matchup_df['num_videos'] = count_views_matchup
matchup_df.columns = ['matchup','avg_views','num_videos']

In [57]:
matchup_df

Unnamed: 0,matchup,avg_views,num_videos
0,ZvZ,119621.0,23
1,ZvP,141852.568966,58
2,ZvT,148570.689655,87
3,TvT,93882.444444,18
4,TvP,115359.255319,47
5,PvP,71545.333333,18


## Getting Game Data
- Game titles are usually before the ":" in the titles

In [58]:
def game_split(message):
    if ":" in message:
        return (game_cleaner("".join((message.lower().split(":")[0].split()))))

In [59]:
player_cleaning_df['game'] = player_cleaning_df['title'].apply(lambda x:game_split(x))

In [60]:
player_cleaning_df['game'].fillna("none",inplace=True)

In [61]:
player_cleaning_df['game'].value_counts()

starcraft2                978
none                      119
warcraft3                  49
frostpunk                  40
ageofempires4              11
zelda                      11
starcraft                  11
sekiro                      8
theyarebillions             4
underlords                  4
worldofwarcraft             3
hearthstone                 2
arise                       1
ironharvest                 1
thestanleyparable           1
mutantyearzero              1
fortnite                    1
farcry5                     1
tropico6                    1
worldofwarcraftclassic      1
darksouls3                  1
diablo4announcement         1
sabzu                       1
aplaguetale                 1
Name: game, dtype: int64

In [62]:
player_cleaning_df['game'].value_counts()

starcraft2                978
none                      119
warcraft3                  49
frostpunk                  40
ageofempires4              11
zelda                      11
starcraft                  11
sekiro                      8
theyarebillions             4
underlords                  4
worldofwarcraft             3
hearthstone                 2
arise                       1
ironharvest                 1
thestanleyparable           1
mutantyearzero              1
fortnite                    1
farcry5                     1
tropico6                    1
worldofwarcraftclassic      1
darksouls3                  1
diablo4announcement         1
sabzu                       1
aplaguetale                 1
Name: game, dtype: int64

In [63]:
game_df = player_cleaning_df[['game','views']].groupby('game').agg(['mean','count']).reset_index()
game_df.columns = ['game_title','avg_views','num_videos']

In [64]:
game_df

Unnamed: 0,game_title,avg_views,num_videos
0,ageofempires4,68345.363636,11
1,aplaguetale,30590.0,1
2,arise,17146.0,1
3,darksouls3,15482.0,1
4,diablo4announcement,74531.0,1
5,farcry5,23703.0,1
6,fortnite,18326.0,1
7,frostpunk,31125.125,40
8,hearthstone,20215.5,2
9,ironharvest,101742.0,1


# Word Count Tokenizer

In [65]:
# These stop words don't add much to the analysis.  Kept other common stop words
special_stops = ['of','OF','is','the','THE','by','BY','it','in','on','and','but','being','an','for','to','they','any','from',
                'then','some','you','your','their','as','about','out','with','his','hers','he','she','at','go']

## One Word Tokenizer

In [66]:
one_words = CountVectorizer(preprocessor=splitter,stop_words=special_stops)
one_words_counter = one_words.fit_transform(player_cleaning_df['title'])

In [67]:
one_words.vocabulary_

{'starcraft2': 1669,
 'EPIC': 309,
 'COMEBACKS': 205,
 'scarlett': 1620,
 'vs': 1768,
 'byun': 1168,
 'maru': 1449,
 'LOVES': 544,
 'CHEESING': 189,
 'cure': 1223,
 'best': 1143,
 'can': 1170,
 'maxpax': 1458,
 'OVERTAKE': 683,
 'showtime': 1637,
 'top': 1722,
 'position': 1553,
 'dark': 1230,
 'epic': 1281,
 'NEW': 643,
 'micro': 1466,
 'trick': 1727,
 'MASSING': 572,
 'ORACLES': 670,
 'lambo': 1411,
 'hero': 1364,
 'WORST': 1057,
 'grand': 1346,
 'finals': 1300,
 'ever': 1282,
 'serral': 1631,
 'rogue': 1609,
 'lowko': 1433,
 'explains': 1286,
 'UNIT': 999,
 'COUNTERS': 221,
 'highlights': 1370,
 'TOP': 966,
 'LEVEL': 526,
 'STARCRAFT': 877,
 'zerg': 1800,
 'terran': 1711,
 'mass': 1451,
 'COMMAND': 207,
 'CENTER': 172,
 'strategy': 1679,
 'reynor': 1601,
 'ageofempires4': 1089,
 'empires': 1275,
 'chinese': 1187,
 'PROTOSS': 738,
 'aoe4': 1103,
 'live': 1428,
 'gameplay': 1325,
 'FRIENDLY': 376,
 'FIRE': 353,
 'clem': 1193,
 'trap': 1726,
 'SNEAKY': 860,
 'FUNGALS': 382,
 'heromarin

In [68]:
len(one_words.vocabulary_)

1805

## Bi/Tri gram Tokenizer

In [69]:
# Using a CountVectorizer with the predefined splitter function and special stop words for processing
# Also tokenizing for tri-grams to catch matchups X vs Y
grammed = CountVectorizer(preprocessor=splitter,stop_words=special_stops,ngram_range=(1,3))
grammedcounter = grammed.fit_transform(player_cleaning_df['title'])

In [70]:
grammed.vocabulary_

{'starcraft2': 7568,
 'EPIC': 1264,
 'COMEBACKS': 863,
 'scarlett': 7306,
 'vs': 9289,
 'byun': 5177,
 'starcraft2 EPIC': 7798,
 'EPIC COMEBACKS': 1271,
 'COMEBACKS scarlett': 864,
 'scarlett vs': 7317,
 'vs byun': 9424,
 'starcraft2 EPIC COMEBACKS': 7802,
 'EPIC COMEBACKS scarlett': 1272,
 'COMEBACKS scarlett vs': 865,
 'scarlett vs byun': 7319,
 'maru': 6480,
 'LOVES': 2320,
 'CHEESING': 802,
 'cure': 5470,
 'best': 5061,
 'starcraft2 maru': 8734,
 'maru LOVES': 6486,
 'LOVES CHEESING': 2323,
 'CHEESING cure': 807,
 'cure best': 5473,
 'starcraft2 maru LOVES': 8737,
 'maru LOVES CHEESING': 6487,
 'LOVES CHEESING cure': 2324,
 'CHEESING cure best': 808,
 'can': 5264,
 'maxpax': 6564,
 'OVERTAKE': 3102,
 'showtime': 7409,
 'top': 9110,
 'position': 6967,
 'starcraft2 can': 8599,
 'can maxpax': 5265,
 'maxpax OVERTAKE': 6567,
 'OVERTAKE showtime': 3103,
 'showtime top': 7413,
 'top position': 9113,
 'starcraft2 can maxpax': 8600,
 'can maxpax OVERTAKE': 5267,
 'maxpax OVERTAKE showtime'

In [71]:
len(grammed.vocabulary_)

9805

In [72]:
grammed_df = pd.DataFrame(data=grammedcounter.toarray(),columns=grammed.get_feature_names())

In [73]:
grammed_df.head()

Unnamed: 0,100,100 DRONES,100 DRONES maru,100 MUTALISKS,100 PLAYERS,100 PLAYERS ENTER,1000,1000 ZERGLINGS,1000 ZERGLINGS reynor,10TH,...,zest vs soo,zest vs special,zest vs stats,zest vs zoun,zest zerg,zest zerg vs,zoun,zvp,zvp INSANE,zvp INSANE neeb
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
grammed_df['views'] = player_cleaning_df['views']

In [75]:
grammed_df.head()

Unnamed: 0,100,100 DRONES,100 DRONES maru,100 MUTALISKS,100 PLAYERS,100 PLAYERS ENTER,1000,1000 ZERGLINGS,1000 ZERGLINGS reynor,10TH,...,zest vs special,zest vs stats,zest vs zoun,zest zerg,zest zerg vs,zoun,zvp,zvp INSANE,zvp INSANE neeb,views
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8088
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,57301
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,43247
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,71067
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,69353


In [76]:
highest_mean = []
for col in grammed_df.columns:
    highest_mean.append([col,grammed_df['views'].loc[grammed_df[col]==1].mean()])

In [77]:
sorted(highest_mean,reverse=True,key=(lambda x: x[1]))

[['starcraft2 ZERG MOTHERSHIP', 1783323.0],
 ['alphastar vs', 1089642.0],
 ['alphastar vs pro', 1089642.0],
 ['deepmind alphastar vs', 1089642.0],
 ['vs pro', 1089642.0],
 ['vs pro gamer', 1089642.0],
 ['cartooned original terran', 960836.0],
 ['original terran', 960836.0],
 ['original terran campaign', 960836.0],
 ['NUKES IN', 842129.0],
 ['starcraft2 TONS', 842129.0],
 ['starcraft2 TONS NUKES', 842129.0],
 ['grandmaster free', 673741.0],
 ['grandmaster free all', 673741.0],
 ['starcraft2 grandmaster', 673741.0],
 ['starcraft2 grandmaster free', 673741.0],
 ['DISGUSTING TERRAN', 671643.0],
 ['FLORENCIOS DISGUSTING', 671643.0],
 ['FLORENCIOS DISGUSTING TERRAN', 671643.0],
 ['starcraft2 FLORENCIOS DISGUSTING', 671643.0],
 ['HARDEST difficulty', 625716.0],
 ['brutal HARDEST', 625716.0],
 ['brutal HARDEST difficulty', 625716.0],
 ['co op brutal', 625716.0],
 ['op brutal', 625716.0],
 ['op brutal HARDEST', 625716.0],
 ['cartooned original zerg', 621772.0],
 ['original zerg', 621772.0],
 ['

# Tokenizer for Neural Network

In [78]:
tokenizer = Tokenizer(lower=False, oov_token="<OOV>")

In [79]:
player_cleaning_df['preprocessed_titles'] = player_cleaning_df['title'].apply(lambda x: splitter(x))

In [80]:
tokenizer.fit_on_texts(player_cleaning_df['preprocessed_titles'])

In [81]:
tokenizer.word_index

{'<OOV>': 1,
 'starcraft2': 2,
 'vs': 3,
 'THE': 4,
 'highlights': 5,
 'best': 6,
 'of': 7,
 'zerg': 8,
 'OF': 9,
 'lowko': 10,
 'campaign': 11,
 'ep': 12,
 'A': 13,
 'ZERG': 14,
 'warcraft3': 15,
 'NEW': 16,
 '5': 17,
 'lowkotv': 18,
 'clem': 19,
 '2': 20,
 'serral': 21,
 'reforged': 22,
 'terran': 23,
 'GAME': 24,
 'frostpunk': 25,
 '3': 26,
 'reynor': 27,
 'game': 28,
 'PROTOSS': 29,
 'twitch': 30,
 'TERRAN': 31,
 'CHEESE': 32,
 'maru': 33,
 'dark': 34,
 'byun': 35,
 'starcraft': 36,
 'protoss': 37,
 'bly': 38,
 'cure': 39,
 'RUSH': 40,
 'STARCRAFT': 41,
 'live': 42,
 'VS': 43,
 'parting': 44,
 'zest': 45,
 'MASS': 46,
 'EPIC': 47,
 'showtime': 48,
 'IS': 49,
 'games': 50,
 'viewer': 51,
 'rush': 52,
 'IN': 53,
 'maxpax': 54,
 'BEST': 55,
 'INSANE': 56,
 'innovation': 57,
 'scarlett': 58,
 'trap': 59,
 'plays': 60,
 'new': 61,
 'SERRAL': 62,
 'TY': 63,
 'NYDUS': 64,
 'solar': 65,
 'base': 66,
 'BATTLECRUISER': 67,
 'PRO': 68,
 '1': 69,
 'rogue': 70,
 'harstem': 71,
 'match': 72,
 'b

In [82]:
len(tokenizer.word_index.keys())

1830

In [83]:
tokenizer.word_counts

OrderedDict([('starcraft2', 978),
             ('EPIC', 20),
             ('COMEBACKS', 1),
             ('scarlett', 16),
             ('vs', 418),
             ('byun', 29),
             ('maru', 32),
             ('LOVES', 2),
             ('CHEESING', 4),
             ('cure', 26),
             ('best', 75),
             ('of', 74),
             ('3', 39),
             ('can', 4),
             ('maxpax', 17),
             ('OVERTAKE', 1),
             ('showtime', 20),
             ('top', 2),
             ('position', 2),
             ('dark', 30),
             ('epic', 7),
             ('NEW', 48),
             ('micro', 10),
             ('trick', 1),
             ('MASSING', 5),
             ('ORACLES', 1),
             ('lambo', 11),
             ('hero', 7),
             ('WORST', 5),
             ('grand', 5),
             ('finals', 2),
             ('ever', 3),
             ('serral', 41),
             ('rogue', 14),
             ('lowko', 57),
             ('explains', 1)

In [84]:
sequences = tokenizer.texts_to_sequences(player_cleaning_df['preprocessed_titles'])
padded_sequences = pad_sequences(sequences, padding="post")

In [85]:
len(padded_sequences)

1252

In [86]:
padded_sequences.shape

(1252, 13)

In [87]:
padded_sequences[0]

array([  2,  47, 757,  58,   3,  35,   0,   0,   0,   0,   0,   0,   0])

In [88]:
player_cleaning_df['padded_sequences'] = padded_sequences.tolist()

In [89]:
player_cleaning_df['padded_sequences']

0            [2, 47, 757, 58, 3, 35, 0, 0, 0, 0, 0, 0, 0]
1          [2, 33, 465, 254, 39, 6, 7, 26, 0, 0, 0, 0, 0]
2       [2, 255, 54, 758, 48, 466, 467, 0, 0, 0, 0, 0, 0]
3       [2, 34, 160, 16, 107, 759, 35, 3, 34, 0, 0, 0, 0]
4          [2, 208, 760, 89, 3, 161, 0, 0, 0, 0, 0, 0, 0]
                              ...                        
1247        [2, 4, 185, 220, 737, 0, 0, 0, 0, 0, 0, 0, 0]
1248        [2, 719, 101, 4, 749, 0, 0, 0, 0, 0, 0, 0, 0]
1249           [2, 4, 14, 314, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1250      [2, 46, 190, 14, 610, 756, 0, 0, 0, 0, 0, 0, 0]
1251          [2, 753, 9, 754, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Name: padded_sequences, Length: 1252, dtype: object

In [90]:
padded_sequences

array([[  2,  47, 757, ...,   0,   0,   0],
       [  2,  33, 465, ...,   0,   0,   0],
       [  2, 255,  54, ...,   0,   0,   0],
       ...,
       [  2,   4,  14, ...,   0,   0,   0],
       [  2,  46, 190, ...,   0,   0,   0],
       [  2, 753,   9, ...,   0,   0,   0]])

# TF IDF Vectorizer

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [92]:
tf = TfidfVectorizer(preprocessor=splitter, lowercase=False)
fit_tf = tf.fit_transform(player_cleaning_df['title'])

In [93]:
tf_df = pd.DataFrame(data=fit_tf.todense(),columns=tf.get_feature_names())

In [94]:
tf_df['views'] = player_cleaning_df['views']

# Data Exports

In [97]:
# # Players_cleaning_df for use with visualizations
# with open(r'Data\players_cleaned_df.pickle', 'wb') as f:
#     pickle.dump(player_cleaning_df, f)

# # Tokenizer for decoding
# with open(r'Data\tokenizer.pickle','wb') as f1:
#     pickle.dump(tokenizer,f1)

# # 1gram - Trigram Tokenizers  grammed, one_words
# with open(r'Data\one_words.pickle','wb') as f2:
#     pickle.dump(one_words,f2)
# with open(r'Data\grammed.pickle','wb') as f3:
#     pickle.dump(grammed,f3)
# with open(r'Data\grammed_df.pickle','wb') as f3_2:
#     pickle.dump(grammed_df,f3_2)

# # Matchups
# with open(r'Data\matchup_df.pickle','wb') as f4:
#     pickle.dump(matchup_df,f4)

# # Games
# with open(r'Data\game_df.pickle','wb') as f5:
#     pickle.dump(game_df,f5)

# # StarCraft 2 races
# with open(r'Data\sc2races_df.pickle','wb') as f6:
#     pickle.dump(sc2races_df,f6)

# # Countries
# with open(r'Data\countries_df.pickle','wb') as f7:
#     pickle.dump(countries_df,f7)

# # Players
# with open(r'Data\player_df.pickle','wb') as f8:
#     pickle.dump(dropped_players,f8)

# # Padded Sequences Array
# with open(r'Data\padded_sequences.pickle','wb') as f9:
#     pickle.dump(padded_sequences,f9)


# # TFIDF Fit Vectorizer
# with open(r'Data\tfidf_fit.pickle','wb') as f10:
#     pickle.dump(tf,f10)

# # TFIDF Vectorizer Data
# with open(r'Data\tf_df.pickle','wb') as f11:
#     pickle.dump(tf_df,f11)