This notebook covers the cleaning, formatting and upload of the scraped information.

In [9]:
# importing necessary packages
import pandas as pd
import warnings
# NOTE(review): this silences *all* warnings for the whole notebook, which also
# hides pandas diagnostics (e.g. SettingWithCopyWarning, FutureWarning).
# Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Boxscores


In [10]:
# Read the raw, semicolon-separated boxscore export into a dataframe
boxscores = pd.read_csv(
    "../data/boxscore_full_notformatted.csv",
    sep=";",
)
# Overview of columns, dtypes and non-null counts
boxscores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48617 entries, 0 to 48616
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   matchId  48617 non-null  int64  
 1   team     48603 non-null  object 
 2   name     48617 non-null  object 
 3   min      44465 non-null  object 
 4   2Ps      44465 non-null  object 
 5   3Ps      44465 non-null  object 
 6   FGs      44465 non-null  object 
 7   FTs      44465 non-null  object 
 8   Ps       44465 non-null  float64
 9   As       44465 non-null  float64
 10  Rs       48617 non-null  object 
 11  Fs       48617 non-null  int64  
 12  BLs      44466 non-null  float64
 13  STs      44466 non-null  float64
 14  TOs      48617 non-null  int64  
 15  EFs      48605 non-null  float64
dtypes: float64(5), int64(3), object(8)
memory usage: 5.9+ MB


In [11]:
# Creating a smaller test frame.
# .copy() makes testbox an independent dataframe: the cells below assign
# columns on it, and doing that on a bare slice of `boxscores` is the
# SettingWithCopy pattern (ambiguous whether the parent frame is affected).
testbox = boxscores.head(20).copy()
testbox

Unnamed: 0,matchId,team,name,min,2Ps,3Ps,FGs,FTs,Ps,As,Rs,Fs,BLs,STs,TOs,EFs
0,101059,Science City Jena,#5 Kendall Chones,15:44,2 - 3 - 67%,0 - 0 - 0%,2 - 3 - 67%,0 - 1 - 0%,4.0,0.0,3,4,1.0,1.0,1,6.0
1,101059,Science City Jena,#7 Ermen Reyes-Napoles,24:46,2 - 4 - 50%,1 - 1 - 100%,3 - 5 - 60%,2 - 2 - 100%,9.0,1.0,4,4,0.0,4.0,1,15.0
2,101059,Science City Jena,#8 Lukas Wank,00:00,0 - 0 - 0%,0 - 0 - 0%,0 - 0 - 0%,0 - 0 - 0%,0.0,0.0,0,0,0.0,0.0,0,0.0
3,101059,Science City Jena,#9 Lars Wendt,24:51,2 - 2 - 100%,0 - 2 - 0%,2 - 4 - 50%,0 - 0 - 0%,4.0,1.0,1,1,0.0,0.0,4,0.0
4,101059,Science City Jena,#10 Sascha Leutloff,11:05,1 - 3 - 33%,0 - 0 - 0%,1 - 3 - 33%,2 - 2 - 100%,4.0,0.0,3,1,1.0,0.0,2,4.0
5,101059,Science City Jena,#13 Georg Wilhelm Voigtmann,11:34,2 - 2 - 100%,0 - 0 - 0%,2 - 2 - 100%,1 - 1 - 100%,5.0,0.0,0,2,1.0,0.0,0,6.0
6,101059,Science City Jena,#17 David Edward Hicks III,23:15,4 - 10 - 40%,0 - 2 - 0%,4 - 12 - 33%,2 - 2 - 100%,10.0,0.0,1,3,1.0,0.0,3,1.0
7,101059,Science City Jena,#20 Jan Heber,00:00,0 - 0 - 0%,0 - 0 - 0%,0 - 0 - 0%,0 - 0 - 0%,0.0,0.0,0,0,0.0,0.0,0,0.0
8,101059,Science City Jena,#30 Brady Craig Morningstar,20:03,3 - 5 - 60%,1 - 5 - 20%,4 - 10 - 40%,2 - 2 - 100%,11.0,4.0,1,1,0.0,1.0,0,11.0
9,101059,Science City Jena,#33 Julius Wolf,19:48,2 - 3 - 67%,0 - 3 - 0%,2 - 6 - 33%,0 - 0 - 0%,4.0,1.0,4,3,1.0,0.0,3,3.0



## Filling null values
The data has null values in the following columns:  
* team
* min
* 2Ps
* 3Ps
* FGs
* FTs
* Ps
* As
* BLs
* STs
* TOs (no missing values in the current file according to `info()` above, but it is filled as well)
* EFs

In [12]:
# Default values per kind of column
na_team = "undefined"
na_min = "00:00"
na_quota = "0 - 0 - 0%"
na_float = 0.0

# Column -> replacement used wherever that column has a missing value
na_defaults = {
    'team': na_team,
    'min': na_min,
    **{col: na_quota for col in ('2Ps', '3Ps', 'FGs', 'FTs')},
    **{col: na_float for col in ('Ps', 'As', 'STs', 'BLs', 'TOs', 'EFs')},
}

# A single fillna call with a per-column mapping replaces the twelve
# column-by-column assignments. It returns a new dataframe, so nothing is
# written into a slice of `boxscores`, and re-running the cell is harmless.
testbox = testbox.fillna(value=na_defaults)

testbox


Unnamed: 0,matchId,team,name,min,2Ps,3Ps,FGs,FTs,Ps,As,Rs,Fs,BLs,STs,TOs,EFs
0,101059,Science City Jena,#5 Kendall Chones,15:44,2 - 3 - 67%,0 - 0 - 0%,2 - 3 - 67%,0 - 1 - 0%,4.0,0.0,3,4,1.0,1.0,1,6.0
1,101059,Science City Jena,#7 Ermen Reyes-Napoles,24:46,2 - 4 - 50%,1 - 1 - 100%,3 - 5 - 60%,2 - 2 - 100%,9.0,1.0,4,4,0.0,4.0,1,15.0
2,101059,Science City Jena,#8 Lukas Wank,00:00,0 - 0 - 0%,0 - 0 - 0%,0 - 0 - 0%,0 - 0 - 0%,0.0,0.0,0,0,0.0,0.0,0,0.0
3,101059,Science City Jena,#9 Lars Wendt,24:51,2 - 2 - 100%,0 - 2 - 0%,2 - 4 - 50%,0 - 0 - 0%,4.0,1.0,1,1,0.0,0.0,4,0.0
4,101059,Science City Jena,#10 Sascha Leutloff,11:05,1 - 3 - 33%,0 - 0 - 0%,1 - 3 - 33%,2 - 2 - 100%,4.0,0.0,3,1,1.0,0.0,2,4.0
5,101059,Science City Jena,#13 Georg Wilhelm Voigtmann,11:34,2 - 2 - 100%,0 - 0 - 0%,2 - 2 - 100%,1 - 1 - 100%,5.0,0.0,0,2,1.0,0.0,0,6.0
6,101059,Science City Jena,#17 David Edward Hicks III,23:15,4 - 10 - 40%,0 - 2 - 0%,4 - 12 - 33%,2 - 2 - 100%,10.0,0.0,1,3,1.0,0.0,3,1.0
7,101059,Science City Jena,#20 Jan Heber,00:00,0 - 0 - 0%,0 - 0 - 0%,0 - 0 - 0%,0 - 0 - 0%,0.0,0.0,0,0,0.0,0.0,0,0.0
8,101059,Science City Jena,#30 Brady Craig Morningstar,20:03,3 - 5 - 60%,1 - 5 - 20%,4 - 10 - 40%,2 - 2 - 100%,11.0,4.0,1,1,0.0,1.0,0,11.0
9,101059,Science City Jena,#33 Julius Wolf,19:48,2 - 3 - 67%,0 - 3 - 0%,2 - 6 - 33%,0 - 0 - 0%,4.0,1.0,4,3,1.0,0.0,3,3.0


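Based on this test case we developed a function `fill_box_na`, which can be applied to the dataframe of a boxscore. As the `info()` output below shows, calling it fills the null values of the passed dataframe in place: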

In [13]:
# fill_box_na is imported from the project-local module `formatfunctions`
# NOTE(review): imports are usually collected in the first cell of a notebook
from formatfunctions import fill_box_na
# The return value is not used; the info() output below reports no
# remaining nulls in `boxscores` after this call.
fill_box_na(boxscores)
boxscores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48617 entries, 0 to 48616
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   matchId  48617 non-null  int64  
 1   team     48617 non-null  object 
 2   name     48617 non-null  object 
 3   min      48617 non-null  object 
 4   2Ps      48617 non-null  object 
 5   3Ps      48617 non-null  object 
 6   FGs      48617 non-null  object 
 7   FTs      48617 non-null  object 
 8   Ps       48617 non-null  float64
 9   As       48617 non-null  float64
 10  Rs       48617 non-null  object 
 11  Fs       48617 non-null  int64  
 12  BLs      48617 non-null  float64
 13  STs      48617 non-null  float64
 14  TOs      48617 non-null  int64  
 15  EFs      48617 non-null  float64
dtypes: float64(5), int64(3), object(8)
memory usage: 5.9+ MB




## Formatting
The following columns have to be adjusted:
* Names -> number + name in separate columns
* Team actions -> number = 999
* min as time?
* 2P, 3P, FG, FT -> 1 column into 3 columns
* %-values to decimals
* all other values as integers

### Names and Teamactions formatting

In [14]:
# Split the raw "#<number> <name>" strings into a numeric jersey number and a
# clean player name. Team-level rows carry the literal name "Team".
TEAM_LABEL = "Team"
TEAM_NUMBER = 999  # placeholder number assigned to team-level rows


def split_number_and_name(raw):
    """Return ``(number, name)`` for one raw value of the ``name`` column.

    Parameters
    ----------
    raw : str
        Either the literal ``"Team"`` or a string such as ``"#5 Kendall Chones"``.

    Returns
    -------
    tuple of (int, str)
        The number as an integer (``999`` for team rows) and the name without
        surrounding whitespace.

    Raises
    ------
    ValueError
        If the part before the first space is not a number; failing loudly
        avoids storing a silently mangled value.
    """
    if raw == TEAM_LABEL:
        return TEAM_NUMBER, TEAM_LABEL
    # partition splits at the first space only, so multi-part names stay intact
    number, _, player = raw.partition(" ")
    return int(number.lstrip("#")), player.strip()


# Guard: after the first run "number" already exists and "name" is already
# cleaned, so re-running the cell must not insert or parse a second time.
if "number" not in testbox.columns:
    parsed = pd.DataFrame(
        testbox['name'].map(split_number_and_name).tolist(),
        index=testbox.index,
        columns=["number", "name"],
    )
    # number column goes directly in front of the name column
    testbox.insert(loc=2, column="number", value=parsed["number"])
    testbox['name'] = parsed["name"]

testbox.head(20)

Unnamed: 0,matchId,team,number,name,min,2Ps,3Ps,FGs,FTs,Ps,As,Rs,Fs,BLs,STs,TOs,EFs
0,101059,Science City Jena,5,Kendall Chones,15:44,2 - 3 - 67%,0 - 0 - 0%,2 - 3 - 67%,0 - 1 - 0%,4.0,0.0,3,4,1.0,1.0,1,6.0
1,101059,Science City Jena,7,Ermen Reyes-Napoles,24:46,2 - 4 - 50%,1 - 1 - 100%,3 - 5 - 60%,2 - 2 - 100%,9.0,1.0,4,4,0.0,4.0,1,15.0
2,101059,Science City Jena,8,Lukas Wank,00:00,0 - 0 - 0%,0 - 0 - 0%,0 - 0 - 0%,0 - 0 - 0%,0.0,0.0,0,0,0.0,0.0,0,0.0
3,101059,Science City Jena,9,Lars Wendt,24:51,2 - 2 - 100%,0 - 2 - 0%,2 - 4 - 50%,0 - 0 - 0%,4.0,1.0,1,1,0.0,0.0,4,0.0
4,101059,Science City Jena,10,Sascha Leutloff,11:05,1 - 3 - 33%,0 - 0 - 0%,1 - 3 - 33%,2 - 2 - 100%,4.0,0.0,3,1,1.0,0.0,2,4.0
5,101059,Science City Jena,13,Georg Wilhelm Voigtmann,11:34,2 - 2 - 100%,0 - 0 - 0%,2 - 2 - 100%,1 - 1 - 100%,5.0,0.0,0,2,1.0,0.0,0,6.0
6,101059,Science City Jena,17,David Edward Hicks III,23:15,4 - 10 - 40%,0 - 2 - 0%,4 - 12 - 33%,2 - 2 - 100%,10.0,0.0,1,3,1.0,0.0,3,1.0
7,101059,Science City Jena,20,Jan Heber,00:00,0 - 0 - 0%,0 - 0 - 0%,0 - 0 - 0%,0 - 0 - 0%,0.0,0.0,0,0,0.0,0.0,0,0.0
8,101059,Science City Jena,30,Brady Craig Morningstar,20:03,3 - 5 - 60%,1 - 5 - 20%,4 - 10 - 40%,2 - 2 - 100%,11.0,4.0,1,1,0.0,1.0,0,11.0
9,101059,Science City Jena,33,Julius Wolf,19:48,2 - 3 - 67%,0 - 3 - 0%,2 - 6 - 33%,0 - 0 - 0%,4.0,1.0,4,3,1.0,0.0,3,3.0


### Quotas

The columns 2Ps, 3Ps, FGs and FTs currently hold data in the following format: "X - Y - Z%".
Instead, we want to split each column into 3 separate columns (e.g. 2P_P, 2P_A, 2P_R) containing numerical values.

"X - Y - Z%"  ->  X   Y   Z/100

In [26]:
# Split every "X - Y - Z%" string once, then pick the three parts and cast
# them to the intended datatypes
quota_parts = testbox['2Ps'].str.split(" - ")
points = quota_parts.str[0].astype("int64")
attempts = quota_parts.str[1].astype("int64")
# drop the trailing "%" sign and turn the percentage into a fraction
relative = quota_parts.str[2].str[:-1].astype(float) / 100


In [None]:
# extracting position of the column to be separated
