# Data Preparation for EDA and Modeling

In [1]:
# Imports
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.image as mpimg
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string
import re
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings("ignore")

## Defining Processing Functions

In [2]:
# Preprocessor to help tokenize data
def splitter(input_message):
    message = input_message
    container = []
    
    # Identifies special numbers that we want to keep from the data
    # Usually involves computer parts (memory, GPU), numbers found in video game titles, or matchups (1 vs 1, 2 vs 2)
    comp_numbers = [2077,32,64,128,256,512,1024,1080,3080,60,144,2018,2019,2020,2021,2022,100,1,2,3,4]
    
    # specific player cleanup
    message = message.replace("dongraegu","drg")
    message = message.replace("dong rae gu",'drg')
    message = message.replace("rattata",'vanya')
    message = message.replace("liquid","")
    message = message.replace("dark templar","darktemplar")
    message = message.replace("dark shrine","darkshrine")

    # Obtains the first word before the column which is usually a video game title
    splitted = message.split(":")
    index = 0
    if ":" in message:
        container.append("".join(message.split(":")[0].split()))
        index = 1
    
    # Starts overall split of the data
    for word in message.split():
        if (index == 1) | (":" in word):
            index = 2
            continue
        # Removes punctuations and special characters
        all_upper = word.strip("!?()#&*")
        # Removes possessives
        all_upper = all_upper.replace("'s","").replace("'d","").replace("'","")

        # Removes numbers not in our accepted numbers list
        if all_upper.isdigit():
            if (int(all_upper) in comp_numbers):
                pass
            else:
                continue
        
        # Keeping words that are fully capitalized, otherwise, set word to lowercase
        if all_upper == all_upper.upper():
            container.append(all_upper)
        else:
            container.append(all_upper.lower())
    joined = " ".join(container)
    return joined

In [3]:
# Pulls out special words from predefined list
def get_from_list(message, special_list):
    container = []
    splitted = splitter(message)
    for word in splitted.split():
        if word.lower() in special_list:
            container.append(word.lower())
    return container

In [4]:
# Pulls out specific race or country info for players
def get_player_info(message,special_list,special_df, info=None):
    info_df = special_df.copy()
    info_df['tag'] = info_df['tag'].apply(lambda x: x.lower())
    container = []
    players = []
    splitted = splitter(message)
    if (info == 'race') | (info == 'country'):
        for word in splitted.split():
            if word.lower() in special_list:
                row = special_df.loc[info_df['tag'] == word.lower()]
                value = row[info].values[0]
                if word.lower() not in players:
                    container.append(value)
                    players.append(word.lower())
    # handles special cases
    if len(container)==3:
        container.pop(0)
    return container

In [5]:
# Extracts desired information from video stats
# Had to remove dislike count as they were removed from YouTube recently
def get_stats(video):
    vid_id = video['id']
    title = video['snippet']['title']
    date = video['snippet']['publishedAt']
    thumbnail = video['snippet']['thumbnails']['medium']['url']
    tags = video['snippet']['tags']
    views = video['statistics']['viewCount']
    likes = video['statistics']['likeCount']
    return {'vid_id':vid_id,'title':title,'date':date,'thumbnail':thumbnail,
            'tags':tags,'views':views,'likes':likes}

## Importing data pulled from APIs

In [6]:
all_stats = pickle.load(open(r"Data/all_stats.p","rb"))
players = pickle.load(open(r"Data/players_df.pickle","rb"))

In [7]:
# converting all_stats info a dataframe
main_df = pd.DataFrame.from_dict(all_stats).apply(lambda x: get_stats(x), axis=1).apply(pd.Series)

In [8]:
main_df['views'] = main_df['views'].astype(int)
main_df['likes'] = main_df['likes'].astype(int)

In [9]:
main_df.head()

Unnamed: 0,vid_id,title,date,thumbnail,tags,views,likes
0,LHDQ8iSbdck,StarCraft 2: EPIC COMEBACKS! (Scarlett vs ByuN),2021-12-21T19:00:14Z,https://i.ytimg.com/vi/LHDQ8iSbdck/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",8088,500
1,IM7OdVKFW4M,StarCraft 2: Maru LOVES CHEESING Cure! (Best-o...,2021-12-19T18:54:54Z,https://i.ytimg.com/vi/IM7OdVKFW4M/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",57301,1502
2,ReDXHIf6cuc,StarCraft 2: Can MaxPax OVERTAKE ShoWTimE's To...,2021-12-18T19:00:18Z,https://i.ytimg.com/vi/ReDXHIf6cuc/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",43247,1400
3,kWfuYGNq1C4,StarCraft 2: Dark's Epic NEW Micro Trick! (Byu...,2021-12-17T19:00:30Z,https://i.ytimg.com/vi/kWfuYGNq1C4/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",71067,2080
4,IUqQ_rVnSXg,StarCraft 2: MASSING ORACLES? (Lambo vs herO),2021-12-16T19:00:05Z,https://i.ytimg.com/vi/IUqQ_rVnSXg/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",69353,1839


In [10]:
# Only keeping the last 3 years of YouTube video data
current_df = main_df.loc[main_df['date']>='2018']

In [11]:
current_df

Unnamed: 0,vid_id,title,date,thumbnail,tags,views,likes
0,LHDQ8iSbdck,StarCraft 2: EPIC COMEBACKS! (Scarlett vs ByuN),2021-12-21T19:00:14Z,https://i.ytimg.com/vi/LHDQ8iSbdck/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",8088,500
1,IM7OdVKFW4M,StarCraft 2: Maru LOVES CHEESING Cure! (Best-o...,2021-12-19T18:54:54Z,https://i.ytimg.com/vi/IM7OdVKFW4M/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",57301,1502
2,ReDXHIf6cuc,StarCraft 2: Can MaxPax OVERTAKE ShoWTimE's To...,2021-12-18T19:00:18Z,https://i.ytimg.com/vi/ReDXHIf6cuc/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",43247,1400
3,kWfuYGNq1C4,StarCraft 2: Dark's Epic NEW Micro Trick! (Byu...,2021-12-17T19:00:30Z,https://i.ytimg.com/vi/kWfuYGNq1C4/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",71067,2080
4,IUqQ_rVnSXg,StarCraft 2: MASSING ORACLES? (Lambo vs herO),2021-12-16T19:00:05Z,https://i.ytimg.com/vi/IUqQ_rVnSXg/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",69353,1839
...,...,...,...,...,...,...,...
1247,2KJv_9IbrZ0,StarCraft 2: THE PLANETARY FORTRESS CONTAIN!,2018-01-06T21:00:00Z,https://i.ytimg.com/vi/2KJv_9IbrZ0/mqdefault.jpg,"[lowko, lowkotv, tutorial, guide, commentary, ...",228100,4394
1248,aOqlr2tN4Eo,StarCraft 2: ESCAPE ON THE HYPERION!,2018-01-05T21:00:00Z,https://i.ytimg.com/vi/aOqlr2tN4Eo/mqdefault.jpg,"[StarCraft 2, StarCraft 2: Wings of Liberty, W...",165132,4039
1249,Age4eNRUMrk,StarCraft 2: THE ZERG... MOTHERSHIP?!,2018-01-04T21:00:01Z,https://i.ytimg.com/vi/Age4eNRUMrk/mqdefault.jpg,"[lowko, lowkotv, tutorial, guide, commentary, ...",1783323,32859
1250,czuUoHltWpc,StarCraft 2: MASS SKY ZERG! (Triple Threat),2018-01-03T21:00:01Z,https://i.ytimg.com/vi/czuUoHltWpc/mqdefault.jpg,"[starcraft 2, starcraft 2 co-op, co-op mission...",60053,1143


## Cleaning the Player Data

In [12]:
# Converting string data to lowercase
players['country'] = players['country'].apply(lambda x: str(x).lower())
players['race'] = players['race'].apply(lambda x: str(x).lower())
players['tag'] = players['tag'].apply(lambda x: str(x).lower())

In [13]:
# Removing player names too similar to other commonly used words in titles
remove = ['jim','punk','thor','hyperion','probe','control','alpha','fenix','golden','fear','flood','strange',
          'terran','zerg','scv','nexus','reaper','meat','blink','chance','mechanics','wave','next','nice','zero',
          'shadow','raise','job','doctor','time','has','phoenix','raise','sortof','dns','keen',
          'cham','prototype','academy','ranger','blacksmith','faith','eternity','chase','crimson',
          'albion','fate','tears','coffee','monster','ready','hunter','ling','turn','master','risky']

In [14]:
dropped_players = players.loc[~players['tag'].isin(remove)]

In [15]:
dropped_players.head()

Unnamed: 0,country,id,race,resource_uri,tag
0,fi,485,z,/api/v1/player/485/,serral
1,kr,49,t,/api/v1/player/49/,maru
2,fr,5878,t,/api/v1/player/5878/,clem
3,it,5414,z,/api/v1/player/5414/,reynor
4,kr,76,z,/api/v1/player/76/,dark


In [16]:
player_cleaning_df = current_df.copy()

In [17]:
player_cleaning_df

Unnamed: 0,vid_id,title,date,thumbnail,tags,views,likes
0,LHDQ8iSbdck,StarCraft 2: EPIC COMEBACKS! (Scarlett vs ByuN),2021-12-21T19:00:14Z,https://i.ytimg.com/vi/LHDQ8iSbdck/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",8088,500
1,IM7OdVKFW4M,StarCraft 2: Maru LOVES CHEESING Cure! (Best-o...,2021-12-19T18:54:54Z,https://i.ytimg.com/vi/IM7OdVKFW4M/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",57301,1502
2,ReDXHIf6cuc,StarCraft 2: Can MaxPax OVERTAKE ShoWTimE's To...,2021-12-18T19:00:18Z,https://i.ytimg.com/vi/ReDXHIf6cuc/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",43247,1400
3,kWfuYGNq1C4,StarCraft 2: Dark's Epic NEW Micro Trick! (Byu...,2021-12-17T19:00:30Z,https://i.ytimg.com/vi/kWfuYGNq1C4/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",71067,2080
4,IUqQ_rVnSXg,StarCraft 2: MASSING ORACLES? (Lambo vs herO),2021-12-16T19:00:05Z,https://i.ytimg.com/vi/IUqQ_rVnSXg/mqdefault.jpg,"[Lowko, LowkoTV, Simon Heijnen, Tutorial, Guid...",69353,1839
...,...,...,...,...,...,...,...
1247,2KJv_9IbrZ0,StarCraft 2: THE PLANETARY FORTRESS CONTAIN!,2018-01-06T21:00:00Z,https://i.ytimg.com/vi/2KJv_9IbrZ0/mqdefault.jpg,"[lowko, lowkotv, tutorial, guide, commentary, ...",228100,4394
1248,aOqlr2tN4Eo,StarCraft 2: ESCAPE ON THE HYPERION!,2018-01-05T21:00:00Z,https://i.ytimg.com/vi/aOqlr2tN4Eo/mqdefault.jpg,"[StarCraft 2, StarCraft 2: Wings of Liberty, W...",165132,4039
1249,Age4eNRUMrk,StarCraft 2: THE ZERG... MOTHERSHIP?!,2018-01-04T21:00:01Z,https://i.ytimg.com/vi/Age4eNRUMrk/mqdefault.jpg,"[lowko, lowkotv, tutorial, guide, commentary, ...",1783323,32859
1250,czuUoHltWpc,StarCraft 2: MASS SKY ZERG! (Triple Threat),2018-01-03T21:00:01Z,https://i.ytimg.com/vi/czuUoHltWpc/mqdefault.jpg,"[starcraft 2, starcraft 2 co-op, co-op mission...",60053,1143


### Adding labels for players in title

In [18]:
player_list = dropped_players.tag.values

In [19]:
# Creates a column with a list of players found in title
player_cleaning_df['players'] = player_cleaning_df['title'].apply(lambda x: get_from_list(x,player_list))

In [20]:
# Creates a column for each player and add 1 if player is in title
for i in range(len(player_list)):
    for player in player_cleaning_df.iloc[i].players:
        player_cleaning_df.at[i,player]=1

### Finding average views and number of videos for each player

In [21]:
avg_views = []
count_views = []

# Find average views and number of videos, 0 if none
for player in player_list:
    try:
        avg_views.append(player_cleaning_df.loc[player_cleaning_df[player]==True].views.mean())
        count_views.append(player_cleaning_df.loc[player_cleaning_df[player]==True].views.count())
    except:
        avg_views.append(0)
        count_views.append(0)

In [22]:
# Add these values to the player info data frame
dropped_players['avg_views'] = avg_views
dropped_players['num_videos'] = count_views

In [23]:
dropped_players.head()

Unnamed: 0,country,id,race,resource_uri,tag,avg_views,num_videos
0,fi,485,z,/api/v1/player/485/,serral,157787.45283,53
1,kr,49,t,/api/v1/player/49/,maru,146143.527778,36
2,fr,5878,t,/api/v1/player/5878/,clem,142728.652174,46
3,it,5414,z,/api/v1/player/5414/,reynor,140566.318182,44
4,kr,76,z,/api/v1/player/76/,dark,129995.939394,33


## Getting Country Data

In [24]:
countries_list = np.array(list(set(dropped_players.country)))

In [25]:
countries_list[:5]

array(['au', 'in', 'my', 'gt', 'by'], dtype='<U4')

In [26]:
countries_list.astype(str)

array(['au', 'in', 'my', 'gt', 'by', 'hu', 'co', 'bg', 'it', 'sk', 'mm',
       'id', 'ar', 'dz', 'za', 'is', 'kr', 'uk', 'es', 'ie', 'fr', 'hk',
       'vn', 'cr', 'pl', 'pe', 'cz', 've', 'none', 'nz', 'nl', 'hr', 'cu',
       'no', 'bo', 'de', 'il', 'mx', 'lu', 'dk', 'tr', 'at', 'br', 'ba',
       'ro', 'cl', 'lv', 'ee', 'be', 'bd', 'ru', 'fi', 'ca', 'ch', 'lt',
       'ph', 'us', 'si', 'cn', 'sg', 'eg', 'jp', 'tw', 'se', 'ua', 'uz'],
      dtype='<U4')

In [27]:
# Adds list of countries associated with the player names found in titles
player_cleaning_df['country'] = player_cleaning_df['title'].apply(
                                lambda x: get_player_info(x,player_list,dropped_players,info='country'))

In [28]:
for i in range(len(player_cleaning_df)):
    for country in player_cleaning_df.iloc[i].country:
        player_cleaning_df.at[i,country]=1
player_cleaning_df.fillna(0,inplace=True)

In [29]:
avg_views_country = []
count_views_country = []

# Find average views and number of videos for countries
for country in countries_list:
    try:
        avg_views_country.append(player_cleaning_df.loc[player_cleaning_df[country]==1].views.mean())
        count_views_country.append(player_cleaning_df.loc[player_cleaning_df[country]==1].views.count())
    except:
        avg_views_country.append(0)
        count_views_country.append(0)

In [30]:
countries_df = pd.DataFrame(data=[countries_list])

In [31]:
countries_df = countries_df.T

In [32]:
countries_df['avg_views'] = avg_views_country
countries_df['num_videos'] = count_views_country

In [33]:
countries_df.columns = ['country_code','avg_views','num_videos']

In [34]:
countries_df

Unnamed: 0,country_code,avg_views,num_videos
0,au,0.000000,0
1,in,0.000000,0
2,my,0.000000,0
3,gt,0.000000,0
4,by,0.000000,0
...,...,...,...
61,jp,0.000000,0
62,tw,64856.000000,1
63,se,95252.500000,2
64,ua,145191.178571,28


In [36]:
# imports csv with country codes and names
ccodes = pd.read_csv(r'Data\country_codes.csv',encoding='latin-1')
ccodes['Code'] = ccodes['Code'].apply(lambda x: str(x).lower().strip())

In [37]:
country_zip = countries_df['country_code']

In [38]:
country_container = []
for country in country_zip:
    try:
        country_container.append(ccodes.loc[ccodes['Code']==country,'Country'].values[0])
    except:
        country_container.append("None")

In [39]:
countries_df['country'] = country_container

In [40]:
countries_df.head()

Unnamed: 0,country_code,avg_views,num_videos,country
0,au,0.0,0,Australia
1,in,0.0,0,India
2,my,0.0,0,Malaysia
3,gt,0.0,0,Guatemala
4,by,0.0,0,Belarus


## Getting SC2 Race Data

In [41]:
# Following similar logic to the countries, except the totals will have number of races instead of just a label

In [42]:
# Starcraft 2 races = Zerg, Protoss, Terran
sc2races = ['z','p','t']


In [43]:
# Adds list of countries associated with the player names found in titles
player_cleaning_df['sc2race'] = player_cleaning_df['title'].apply(
                                lambda x: get_player_info(x,player_list,dropped_players,info='race'))
player_cleaning_df['z'] = 0
player_cleaning_df['t'] = 0
player_cleaning_df['p'] = 0

In [44]:
player_cleaning_df['sc2race']

0       [z, t]
1       [t, t]
2       [p, p]
3       [z, t]
4       [z, p]
         ...  
1247        []
1248        []
1249        []
1250        []
1251        []
Name: sc2race, Length: 1252, dtype: object

In [45]:
for i in range(len(player_cleaning_df)):
    for race in player_cleaning_df.iloc[i].sc2race:
        try:
            player_cleaning_df.at[i,race]+=1
        except:
            player_cleaning_df.at[i,race]=1

In [46]:
avg_views_race = []
count_views_race = []

# Find average views and number of videos for sc2 races
for race in sc2races:
    try:
        avg_views_race.append(player_cleaning_df.loc[player_cleaning_df[race]>0].views.mean())
        count_views_race.append(player_cleaning_df.loc[player_cleaning_df[race]>0].views.count())
    except:
        avg_views_race.append(0)
        count_views_race.append(0)

In [47]:
sc2races_df = pd.DataFrame(data=[sc2races])
sc2races_df = sc2races_df.T

In [48]:
sc2races_df['avg_views'] = avg_views_race
sc2races_df['num_videos'] = count_views_race
sc2races_df.columns = ['race','avg_views','num_videos']

In [49]:
sc2races_df

Unnamed: 0,race,avg_views,num_videos
0,z,133687.225941,239
1,p,119526.383333,180
2,t,128016.71134,194


## Getting Matchup Data
- Focusing on just the 1 vs 1 matchups for this EDA, which is the most popular in this channel

In [50]:
# 6 combinations of matchups for 3 races
sc2matchups = ['ZvZ','ZvP','ZvT','TvT','TvP','PvP']

In [51]:
for matchup in sc2matchups:
    player_cleaning_df[matchup] = 0

In [52]:
# Assigns a label for each combination of matchup
for i in range(len(player_cleaning_df)):
    race = player_cleaning_df.iloc[i].sc2race
    if len(race) == 2:
        if ("z" in race) & ("t" in race):
            player_cleaning_df.at[i,'ZvT']=1
        elif ("z" in race) & ("p" in race):
            player_cleaning_df.at[i,'ZvP']=1
        elif ("t" in race) & ("p" in race):
            player_cleaning_df.at[i,'TvP']=1
        elif ("z" in race):
            player_cleaning_df.at[i,'ZvZ']=1
        elif ("t" in race):
            player_cleaning_df.at[i,'TvT']=1
        elif ("p" in race):
            player_cleaning_df.at[i,'PvP']=1

In [53]:
matchup_df = pd.DataFrame(data=[sc2matchups])
matchup_df = matchup_df.T

In [54]:
avg_views_matchup = []
count_views_matchup = []

# Find average views and number of videos for sc2 races
for matchup in sc2matchups:
    try:
        avg_views_matchup.append(player_cleaning_df.loc[player_cleaning_df[matchup]>0].views.mean())
        count_views_matchup.append(player_cleaning_df.loc[player_cleaning_df[matchup]>0].views.count())
    except:
        avg_views_matchup.append(0)
        count_views_matchup.append(0)

In [55]:
matchup_df['avg_views'] = avg_views_matchup
matchup_df['num_videos'] = count_views_matchup
matchup_df.columns = ['matchup','avg_views','num_videos']

In [56]:
matchup_df

Unnamed: 0,matchup,avg_views,num_videos
0,ZvZ,119621.0,23
1,ZvP,141852.568966,58
2,ZvT,148570.689655,87
3,TvT,93882.444444,18
4,TvP,115359.255319,47
5,PvP,71545.333333,18


## Getting Game Data
- Game titles are usually before the ":" in the titles

In [57]:
def game_split(message):
    if ":" in message:
        return ("".join(message.lower().split(":")[0].split()))

In [58]:
player_cleaning_df['game'] = player_cleaning_df['title'].apply(lambda x:game_split(x))

In [59]:
player_cleaning_df['game'].fillna("none",inplace=True)

In [60]:
player_cleaning_df['game'].value_counts()

starcraft2                   966
none                         119
warcraft3                     46
frostpunk                     40
ageofempires4                 11
starcraft2co-op               11
lowkovszelda                  11
starcraft                     10
lowkovssekiro                  8
theyarebillions                4
warcraft3reforged              3
underlords                     2
dotaunderlords                 2
thestanleyparable              1
mutantyearzero                 1
diablo4announcement            1
darksouls3                     1
worldofwarcraft                1
farcry5                        1
lowkoplays...starcraft         1
lowkoplaysworldofwarcraft      1
hearthstonebattlegrounds       1
worldofwarcraftrp              1
lowkovsaplaguetale             1
worldofwarcraftclassic         1
lowkovsabzu                    1
starcraft2co-opbrutal+6        1
hearthstone                    1
ironharvest                    1
tropico6                       1
arise     

In [61]:
# Cleaning game titles to consolidate some of the same games
player_cleaning_df['game'] = player_cleaning_df['game'].apply(lambda x: x.replace("starcraft2co-op","starcraft2"))
player_cleaning_df['game'] = player_cleaning_df['game'].apply(lambda x: x.replace("starcraft2brutal+6","starcraft2"))
player_cleaning_df['game'] = player_cleaning_df['game'].apply(lambda x: x.replace("warcraft3reforged","warcraft3"))
player_cleaning_df['game'] = player_cleaning_df['game'].apply(lambda x: x.replace("lowkovszelda","zelda"))
player_cleaning_df['game'] = player_cleaning_df['game'].apply(lambda x: x.replace("lowkoplays...starcraft","starcraft"))
player_cleaning_df['game'] = player_cleaning_df['game'].apply(lambda x: x.replace("dotaunderlords","underlords"))
player_cleaning_df['game'] = player_cleaning_df['game'].apply(lambda x: x.replace("worldofwarcraftrp","worldofwarcraft"))
player_cleaning_df['game'] = player_cleaning_df['game'].apply(lambda x: x.replace("lowkoplaysworldofwarcraft",
                                                                                  "worldofwarcraft"))
player_cleaning_df['game'] = player_cleaning_df['game'].apply(lambda x: x.replace("lowkovsabzu","sabzu"))
player_cleaning_df['game'] = player_cleaning_df['game'].apply(lambda x: x.replace("lowkovsaplaguetale","aplaguetale"))
player_cleaning_df['game'] = player_cleaning_df['game'].apply(lambda x: x.replace("lowkovssekiro","sekiro"))
player_cleaning_df['game'] = player_cleaning_df['game'].apply(lambda x: x.replace("hearthstonebattlegrounds",
                                                                                  "hearthstone"))

In [62]:
player_cleaning_df['game'].value_counts()

starcraft2                978
none                      119
warcraft3                  49
frostpunk                  40
starcraft                  11
ageofempires4              11
zelda                      11
sekiro                      8
theyarebillions             4
underlords                  4
worldofwarcraft             3
hearthstone                 2
aplaguetale                 1
diablo4announcement         1
mutantyearzero              1
darksouls3                  1
arise                       1
thestanleyparable           1
sabzu                       1
farcry5                     1
tropico6                    1
worldofwarcraftclassic      1
ironharvest                 1
fortnite                    1
Name: game, dtype: int64

In [63]:
game_df = player_cleaning_df[['game','views']].groupby('game').agg(['mean','count']).reset_index()
game_df.columns = ['game_title','avg_views','num_videos']

In [64]:
game_df

Unnamed: 0,game_title,avg_views,num_videos
0,ageofempires4,68345.363636,11
1,aplaguetale,30590.0,1
2,arise,17146.0,1
3,darksouls3,15482.0,1
4,diablo4announcement,74531.0,1
5,farcry5,23703.0,1
6,fortnite,18326.0,1
7,frostpunk,31125.125,40
8,hearthstone,20215.5,2
9,ironharvest,101742.0,1


# Title Preparation for Neural Network

In [65]:
# These stop words don't add much to the analysis.  Kept other common stop words that 
special_stops = ['of','OF','is','the','THE','by','BY','it','in','on','and','but','being','an','for','to','they','any','from',
                'then','some','you','your','their','as','about','out','with','his','hers','he','she','at','go']

In [66]:
# Using a CountVectorizer with the predefined splitter function and special stop words for processing
# Also tokenizing for tri-grams to catch matchups X vs Y
grammed = CountVectorizer(preprocessor=splitter,stop_words=special_stops,ngram_range=(1,3))
grammedcounter = grammed.fit_transform(player_cleaning_df['title'])

In [67]:
grammed.vocabulary_

{'StarCraft2': 4201,
 'EPIC': 1279,
 'COMEBACKS': 867,
 'scarlett': 8771,
 'vs': 9382,
 'byun': 6711,
 'StarCraft2 EPIC': 4420,
 'EPIC COMEBACKS': 1286,
 'COMEBACKS scarlett': 868,
 'scarlett vs': 8782,
 'vs byun': 9517,
 'StarCraft2 EPIC COMEBACKS': 4424,
 'EPIC COMEBACKS scarlett': 1287,
 'COMEBACKS scarlett vs': 869,
 'scarlett vs byun': 8784,
 'maru': 7926,
 'LOVES': 2418,
 'CHEESING': 806,
 'cure': 7004,
 'best': 6595,
 'StarCraft2 maru': 5351,
 'maru LOVES': 7932,
 'LOVES CHEESING': 2421,
 'CHEESING cure': 811,
 'cure best': 7007,
 'StarCraft2 maru LOVES': 5354,
 'maru LOVES CHEESING': 7933,
 'LOVES CHEESING cure': 2422,
 'CHEESING cure best': 812,
 'can': 6798,
 'maxpax': 8010,
 'OVERTAKE': 3238,
 'showtime': 8864,
 'top': 9215,
 'position': 8435,
 'StarCraft2 can': 5216,
 'can maxpax': 6799,
 'maxpax OVERTAKE': 8013,
 'OVERTAKE showtime': 3239,
 'showtime top': 8868,
 'top position': 9218,
 'StarCraft2 can maxpax': 5217,
 'can maxpax OVERTAKE': 6801,
 'maxpax OVERTAKE showtime'

In [68]:
len(grammed.vocabulary_)

9835

In [69]:
grammed_df = pd.DataFrame(data=grammedcounter.toarray(),columns=grammed.get_feature_names())

In [70]:
grammed_df.head()

Unnamed: 0,100,100 DRONES,100 DRONES maru,100 MUTALISKS,100 PLAYERS,100 PLAYERS ENTER,1000,1000 ZERGLINGS,1000 ZERGLINGS reynor,10TH,...,zest vs soo,zest vs special,zest vs stats,zest vs zoun,zest zerg,zest zerg vs,zoun,zvp,zvp INSANE,zvp INSANE neeb
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
grammed_df['views'] = player_cleaning_df['views']

In [72]:
grammed_df.head()

Unnamed: 0,100,100 DRONES,100 DRONES maru,100 MUTALISKS,100 PLAYERS,100 PLAYERS ENTER,1000,1000 ZERGLINGS,1000 ZERGLINGS reynor,10TH,...,zest vs special,zest vs stats,zest vs zoun,zest zerg,zest zerg vs,zoun,zvp,zvp INSANE,zvp INSANE neeb,views
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8088
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,57301
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,43247
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,71067
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,69353


# Data Exports

In [73]:
# Players_cleaning_df for use with visualizations
# with open(r'Data\players_cleaned_df.pickle', 'wb') as f:
#     pickle.dump(player_cleaning_df, f)