In [28]:
# Alfredo Medina PID: 6252242
# Machine Learning Project Evaluating Top 100 Streamers
# Short Description of Program : I had used RankSVM as my algorithm to evaluate the top
# 100 streamers from a dataset of 1000 streamers, I had gotten this dataset from Kaggle,
# but I had modified it slightly to have a category for giveaways. SVM was the best possible
# algorithm I could have used for this type of dataset since it had the highest accuracy compared to
# other algorithms that I had tested, such as KNN. Accuracy, precision, recall, and the runtime are
# calculated in this program. I also sorted the 1000 streamers to produce a list of the top 100,
# using Average viewers, Stream time, Followers, Views gained, and Followers gained as the features
# for the calculation.
# PLEASE NOTE : RUNTIME FOR THIS PROGRAM TAKES 60 - 70 SECONDS ~
# ENVIRONMENT USED : colab.research.google.com

import time # used to set the runtime timer for our code

import numpy as np
import pandas as pd # Used to read our .csv file
from sklearn.metrics import precision_score, recall_score # Used to obtain our measurements
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline # Chains the scaler and the SVM into one estimator
# StandardScalar used for pre-processng, scaling features
from sklearn.preprocessing import StandardScaler, LabelEncoder # Encoder used mainly because our dataset had foreign symbols | makes non-numerical labels, numerical
from sklearn.svm import SVC # Enables for RankSVM to use different kernels (linear, poly.. etc)

# Starting code with timer, to measure runtime.
def runtime():
  """Run a fixed 100-iteration loop that does nothing and return ``None``.

  The loop body has no side effects, and nothing in the visible script
  calls this helper; the actual timing is done with ``time.time()``.
  """
  for _ in range(100):
    pass

# Record the wall-clock start time so the total runtime can be reported at the end.
runtime_start = time.time()

# Load the dataset with pandas; 'twitchdata.csv' must be in the working directory.
data_twitch = pd.read_csv("twitchdata.csv")

# 'Language' is the class the SVM is asked to predict.
classtarget = data_twitch['Language']

# Build the input features.
# - 'Channel' holds the streamer name (an identifier, not a measurement).
# - 'Language' must be removed as well: it is the target, and leaving it among
#   the inputs lets the model read the answer directly (target leakage), which
#   inflates every reported metric.
feature = data_twitch.drop(columns=['Channel', 'Language'])

# Convert every feature column to integer codes so the estimator never receives
# raw strings (avoids e.g. "ValueError: could not convert string to float: 'German'").
# NOTE: .apply() hands each column to the encoder in turn, numeric columns
# included, and the single `encoder` object is re-fitted for every column.
encoder = LabelEncoder()
feature_label_encoder = feature.apply(lambda column: encoder.fit_transform(column))

# Hold out 40% of the rows for testing and train on the remaining 60%
# (the original author noted 89% accuracy with this split).
# random_state pins the shuffle so the split is reproducible between runs.
(feature_train,
 feature_test,
 classtarget_train,
 classtarget_test) = train_test_split(
    feature_label_encoder,
    classtarget,
    test_size=0.4,
    random_state=42,
)

# Standardised copies of the feature matrices. The scaler is fitted on the
# training rows only and then applied unchanged to the test rows.
# These names are kept so existing references keep working.
# NOTE: `y_test` holds the scaled test *features*, despite its name.
scaling = StandardScaler()
x_train = scaling.fit_transform(feature_train)
y_test = scaling.transform(feature_test)

# Train the SVM on standardised inputs.
# Bundling a scaler and the SVC into one pipeline means fit(), predict() and
# score() all apply the same scaling internally, so the rest of the script can
# keep passing the raw `feature_train` / `feature_test` frames and the model
# never sees a mix of scaled and unscaled data.
# Change `kernel` to switch between 'linear', 'poly', 'rbf', etc.
# NOTE: SVC is a classifier that predicts 'Language'; the top-100 list further
# down is produced by sorting the table, not by this model.
svm_model = make_pipeline(StandardScaler(), SVC(kernel='linear'))
svm_model.fit(feature_train, classtarget_train)

# Rank the streamers and list the top 100.

# Sort keys, highest values first. With several keys, sort_values() orders by
# the first key and only consults the later ones to break ties.
ranking_columns = [
    'Average viewers',
    'Watch time(Minutes)',
    'Stream time(minutes)',
    'Peak viewers',
    'Followers',
    'Followers gained',
    'Views gained',
]
sort = data_twitch.sort_values(by=ranking_columns, ascending=False)

# Keep only the first 100 rows of the ranked table.
streamer_list = sort.iloc[:100]

# Build one "<channel> -- <language>" label per selected streamer.
streamer_sort = streamer_list['Channel'] + ' -- ' + streamer_list['Language']

# Print the labels one per line.
for sorting in streamer_sort:
  print(sorting)

# Evaluate the trained model on the held-out test split
# (metric usage follows the scikit-learn documentation listed in the references).

# Mean accuracy of the model's predictions on the test features.
accuracy = svm_model.score(feature_test, classtarget_test)
print("\nAccuracy", accuracy)

# Predicted labels for the test rows; precision and recall are computed from these.
classtarget_prediction = svm_model.predict(feature_test)

# scikit-learn metrics take their arguments as (y_true, y_pred), in that order.
# Passing them the other way round swaps precision with recall and, with
# average='weighted', weights each class by its predicted count instead of its
# true count.
#
# average='weighted': per-class scores averaged by the number of true samples
# of each class.
# zero_division=1: a class whose denominator is zero scores 1 instead of the
# default behaviour (emit a warning and score 0).

# Precision = True Positives / (True Positives + False Positives)
precision = precision_score(
    classtarget_test,
    classtarget_prediction,
    average='weighted',
    zero_division=1,
)
print("Precision :", precision)

# Recall = True Positives / (True Positives + False Negatives)
recall = recall_score(
    classtarget_test,
    classtarget_prediction,
    average='weighted',
    zero_division=1,
)
print("Recall :", recall)

# Stop the clock: elapsed wall-clock seconds since `runtime_start` was taken.
runtime_finished = time.time()
final_runtime = runtime_finished - runtime_start

# Report the total runtime of the script.
print("\nRuntime :", final_runtime, "seconds")

# Before running the program, install the required packages:
# !pip install pandas
# !pip install scikit-learn

# *REFERENCES I used in order to get certain portions of my code to work : *
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html
# https://scikit-learn.org/stable/modules/svm.html

# I used these in order to figure out how pieces of the code functions so
# I could implement it into my own.
# I had learned more about the RankSVM model from our week 5 module from class,
# as well as from my data mining course.

dota2ti -- English
dota2ti_ru -- Russian
auronplay -- Spanish
LCS -- English
Rubius -- Spanish
Asmongold -- English
LCK_Korea -- Korean
RocketLeague -- English
LCK -- English
KEEMSTAR -- English
MontanaBlack88 -- German
OverwatchLeague -- English
StarLadder_cs_en -- English
shroud -- English
Tfue -- English
Faker -- Korean
TheGrefg -- Spanish
LEC -- English
Fortnite -- English
xQcOW -- English
RiotGamesBrazil -- Portuguese
TimTheTatman -- English
summit1g -- English
AustinShow -- English
BLASTPremier -- English
TheRealKnossi -- German
EASPORTSFIFA -- English
NICKMERCS -- English
DrDisrespect -- English
ibai -- Spanish
loltyler1 -- English
DreamHackDota2_RU -- Russian
GarenaTW -- Chinese
NOBRU -- Portuguese
LIRIK -- English
Riot Games (riotgames) -- English
Call of Duty (callofduty) -- English
RiotGamesRU -- Russian
Arteezy -- English
sodapoppin -- English
alanzoka -- Portuguese
dota2mc_ru -- Russian
Squeezie -- French
Fextralife -- English
SLAKUN10 -- Spanish
Mongraal -- English
GRONKH