In [10]:
import pandas as pd, numpy as np

df = pd.read_csv('data/ufc-master.csv')

# **EDA & cleaning**

In [11]:
# Let's see the shape and the dataset's beginning 
print('The shape of the dataset is:', df.shape)
df.head(5)

The shape of the dataset is: (6528, 118)


Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner,...,FinishDetails,FinishRound,FinishRoundTime,TotalFightTimeSecs,RedDecOdds,BlueDecOdds,RSubOdds,BSubOdds,RKOOdds,BKOOdds
0,Alexandre Pantoja,Kai Asakura,-250.0,215.0,40.0,215.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,Rear Naked Choke,2.0,2:05,425.0,300.0,800.0,150.0,2500.0,400.0,350.0
1,Shavkat Rakhmonov,Ian Machado Garry,-210.0,295.0,47.619,295.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,,5.0,5:00,1500.0,250.0,650.0,180.0,3000.0,240.0,700.0
2,Ciryl Gane,Alexander Volkov,-380.0,300.0,26.3158,300.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,,3.0,5:00,900.0,-160.0,450.0,1100.0,3000.0,350.0,1100.0
3,Bryce Mitchell,Kron Gracie,-950.0,625.0,10.5263,625.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,...,Elbows,3.0,0:39,639.0,-200.0,1100.0,380.0,1400.0,500.0,4000.0
4,Nate Landwehr,Dooho Choi,-130.0,110.0,76.9231,110.0,2024-12-07,"Las Vegas, Nevada, USA",USA,Blue,...,Elbows,3.0,3:21,801.0,275.0,550.0,500.0,700.0,300.0,250.0


In [12]:
# Now the number of null values for each variable
missings = df.isnull().sum()
missings.sort_values(ascending = False, inplace = True)

print(dict(missings))
print('Total number of missing values: ', missings.sum())

{'BWFeatherweightRank': 6527, 'RWFeatherweightRank': 6519, 'BPFPRank': 6461, 'BWFlyweightRank': 6455, 'RWFlyweightRank': 6432, 'BWStrawweightRank': 6428, 'BWBantamweightRank': 6421, 'BBantamweightRank': 6409, 'BWelterweightRank': 6409, 'BLightHeavyweightRank': 6408, 'BLightweightRank': 6408, 'BFeatherweightRank': 6404, 'BFlyweightRank': 6398, 'BMiddleweightRank': 6391, 'RWStrawweightRank': 6382, 'BHeavyweightRank': 6380, 'RWBantamweightRank': 6374, 'RFeatherweightRank': 6351, 'RBantamweightRank': 6347, 'RMiddleweightRank': 6346, 'RLightHeavyweightRank': 6344, 'RLightweightRank': 6344, 'RHeavyweightRank': 6342, 'RFlyweightRank': 6340, 'RWelterweightRank': 6337, 'RPFPRank': 6275, 'BMatchWCRank': 5328, 'RMatchWCRank': 4749, 'FinishDetails': 3636, 'EmptyArena': 1486, 'BKOOdds': 1360, 'BSubOdds': 1359, 'RSubOdds': 1336, 'RKOOdds': 1334, 'BlueDecOdds': 1116, 'RedDecOdds': 1087, 'BlueAvgSigStrLanded': 930, 'BlueAvgTDPct': 842, 'BlueAvgTDLanded': 833, 'BlueAvgSubAtt': 832, 'BlueAvgSigStrPct': 

In [13]:
# Create a list of the columns which have more than 20% of missing values
columns_to_drop = list(missings[missings >= 0.20 * df.shape[0]].index)

# Drop the columns which have +20% of missing values
df.drop(columns_to_drop, axis = 1, inplace = True)

# Let's see the shape of the dataset after dropping the columns and the total number of null values
print('The new shape of the dataset is:', df.shape)
print('Now, we have', df.isnull().sum().sum(), 'missing values.')
print('Percentage of missing values: ', df.isnull().sum().sum() / (df.shape[0] * df.shape[1]) * 100)

The new shape of the dataset is: (6528, 84)
Now, we have 11311 missing values.
Percentage of missing values:  2.0627261321195145


In [14]:
# Fill the missing values with the mean of the column as there are only 2% of missing values
numerical_columns = df.select_dtypes(include = ['int64', 'float64']).columns
cathegorical_columns = df.select_dtypes(include = ['object']).columns

df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

# Let's see the number of missing values from each categorical columns and their values
cathegorical_missing = df[cathegorical_columns].isnull().sum()[df[cathegorical_columns].isnull().sum() > 0]
print('Number of missing values for each cathegorical variable: \n', cathegorical_missing, '\n')
print('Values of each cathegorical variable which has missings: \n', df[cathegorical_missing.index], '\n')

# Fill the cathegorical missing values from 'BlueStance' with the mode, but 'Finish' and 'FinishRoundTime' will be deleted as they are post-fight variables
df.fillna({'BlueStance': df['BlueStance'].mode()[0]}, inplace = True)
df.drop(['Finish', 'FinishRoundTime'], axis = 1, inplace = True)

# Let's see the number of missing values after filling them
print('Number of total missing values from the dataset after treating these variables: ', df.isnull().sum().sum())

Number of missing values for each cathegorical variable: 
 BlueStance           3
Finish             238
FinishRoundTime    622
dtype: int64 

Values of each cathegorical variable which has missings: 
      BlueStance  Finish FinishRoundTime
0      Orthodox     SUB            2:05
1      Orthodox   U-DEC            5:00
2      Orthodox   S-DEC            5:00
3      Southpaw  KO/TKO            0:39
4      Orthodox  KO/TKO            3:21
...         ...     ...             ...
6523   Orthodox  KO/TKO            0:44
6524   Southpaw  KO/TKO            2:01
6525   Orthodox  KO/TKO            0:47
6526   Orthodox   U-DEC            5:00
6527   Orthodox   U-DEC            5:00

[6528 rows x 3 columns] 

Number of total missing values from the dataset after treating these variables:  0


In [15]:
# Let's see the number of cathegorical variables and their values
print('Number of cathegorical variables: ', len(df.select_dtypes(include = ['object']).columns), ':', df.select_dtypes(include = ['object']).columns.tolist())
df.select_dtypes(include = ['object'])

Number of cathegorical variables:  11 : ['RedFighter', 'BlueFighter', 'Date', 'Location', 'Country', 'Winner', 'WeightClass', 'Gender', 'BlueStance', 'RedStance', 'BetterRank']


Unnamed: 0,RedFighter,BlueFighter,Date,Location,Country,Winner,WeightClass,Gender,BlueStance,RedStance,BetterRank
0,Alexandre Pantoja,Kai Asakura,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,Flyweight,MALE,Orthodox,Orthodox,Red
1,Shavkat Rakhmonov,Ian Machado Garry,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,Welterweight,MALE,Orthodox,Orthodox,Red
2,Ciryl Gane,Alexander Volkov,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,Heavyweight,MALE,Orthodox,Orthodox,Red
3,Bryce Mitchell,Kron Gracie,2024-12-07,"Las Vegas, Nevada, USA",USA,Red,Featherweight,MALE,Southpaw,Southpaw,Red
4,Nate Landwehr,Dooho Choi,2024-12-07,"Las Vegas, Nevada, USA",USA,Blue,Featherweight,MALE,Orthodox,Orthodox,neither
...,...,...,...,...,...,...,...,...,...,...,...
6523,Duane Ludwig,Darren Elkins,2010-03-21,"Broomfield, Colorado, USA",USA,Blue,Lightweight,MALE,Orthodox,Orthodox,neither
6524,John Howard,Daniel Roberts,2010-03-21,"Broomfield, Colorado, USA",USA,Red,Welterweight,MALE,Southpaw,Orthodox,neither
6525,Brendan Schaub,Chase Gormley,2010-03-21,"Broomfield, Colorado, USA",USA,Red,Heavyweight,MALE,Orthodox,Orthodox,neither
6526,Mike Pierce,Julio Paulino,2010-03-21,"Broomfield, Colorado, USA",USA,Red,Welterweight,MALE,Orthodox,Orthodox,neither


In [16]:
# Let's see the number of different values some of these variables could have
print('Number of different locations: ', df['Location'].nunique(), '\n')
print('Number of different countries: ', df['Country'].nunique(), 'which are:', df['Country'].unique(), '\n')
print('Number of different weight classes:', df['WeightClass'].nunique(), ':', df['WeightClass'].unique())

Number of different locations:  150 

Number of different countries:  36 which are: ['USA' 'China' 'Canada' 'United Arab Emirates' 'France' 'Australia'
 'United Kingdom' 'Saudi Arabia' 'Brazil' 'Mexico' 'Singapore' ' USA'
 ' New Zealand' ' South Korea' ' Brazil' ' Russia' ' Singapore'
 ' Australia' ' Denmark' ' Mexico' ' Canada' ' United Arab Emirates'
 ' China' ' Uruguay' ' Sweden' ' United Kingdom' ' Czech Republic'
 ' Argentina' ' Germany' ' Chile' ' Poland' ' Japan' ' Netherlands'
 ' Croatia' ' Ireland' ' Philippines'] 

Number of different weight classes: 13 : ['Flyweight' 'Welterweight' 'Heavyweight' 'Featherweight'
 'Light Heavyweight' 'Catch Weight' 'Lightweight' 'Bantamweight'
 "Women's Strawweight" "Women's Flyweight" 'Middleweight'
 "Women's Bantamweight" "Women's Featherweight"]


In [17]:
def country_to_continent(country):
    country_to_continent = {
    'USA': 'North America',
    'United States': 'North America',  # por si acaso
    'Canada': 'North America',
    'Mexico': 'North America',

    'Brazil': 'South America',
    'Argentina': 'South America',
    'Chile': 'South America',
    'Uruguay': 'South America',

    'United Kingdom': 'Europe',
    'France': 'Europe',
    'Germany': 'Europe',
    'Sweden': 'Europe',
    'Poland': 'Europe',
    'Czech Republic': 'Europe',
    'Ireland': 'Europe',
    'Croatia': 'Europe',
    'Denmark': 'Europe',
    'Netherlands': 'Europe',
    'Russia': 'Europe',

    'China': 'Asia',
    'Japan': 'Asia',
    'Singapore': 'Asia',
    'Philippines': 'Asia',
    'South Korea': 'Asia',

    'United Arab Emirates': 'Asia',
    'Saudi Arabia': 'Asia',

    'Australia': 'Oceania',
    'New Zealand': 'Oceania',
}
    return country_to_continent.get(country, 'Other')

def date_to_season(df):
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    # Add a column with the season of the year
    df['Season'] = df['Date'].dt.month % 12 // 3 + 1
    # Delete the Date column
    df.drop(columns=['Date'], inplace=True)

In [18]:
# We will need to treat each variable separately:
# - 'RedFighter' and 'BlueFighter' are the names of the fighters, so they are not useful for the model, but we will keep them to identify the fighters in the results

# - From 'Date' will be taken only the month to create a new variable with the season of the year, while 'Date' will be dropped.
# But before that, we need to know the index of the last fight in the training set, so we can separate the training and test sets by date when we export it.
df = df.sort_values(by = 'Date', ascending = True).reset_index(drop = True)	# Sort the dataset by date (olders first) and reset the index
df['Date'] = pd.to_datetime(df['Date'])
last_train_index = df[df['Date'].dt.year <= 2023].index.max()				# Get the index of the last fight in the training set (2023)
date_to_season(df)

# - 'Winner', as it is not the target variable, will be dropped as well, as it is not useful for the model
df['BlueWin'] = df['Winner'].map({'Red': 0, 'Blue': 1})
df.drop(['Winner'], axis = 1, inplace = True)

# - 'Location' has to be eliminated as it has too many different values, while 'Country' is going to be grouped into continents (i.e. 'Europe', 'America', 'Asia', 'Africa', 'Oceania') to reduce the number of possible values before doing one-hot encoding
df.drop(['Location'], axis = 1, inplace = True)
df['Continent'] = df['Country'].apply(country_to_continent)
df.drop(['Country'], axis = 1, inplace = True)
df = pd.get_dummies(df, columns = ['Continent'], drop_first = True, dtype = 'int64')


# - 'WeightClass' will be one-hot encoded even though it has 13 different values, as it could have a big importance in the fight outcome
df = pd.get_dummies(df, columns = ['WeightClass'], drop_first = True, dtype = 'int64')

# - 'Gender' will be one-hot encoded as well
df = pd.get_dummies(df, columns = ['Gender'], drop_first = True, dtype = 'int64')

# - 'RedStance' and 'BlueStance' are the stance of the fighters (only 2 possible values), so we will convert them to numerical values. The same for 'BetterRank' (3 possible values: 'Red', 'Blue', 'neither').
df = pd.get_dummies(df, columns = ['RedStance'], drop_first = True, dtype = 'int64')
df = pd.get_dummies(df, columns = ['BlueStance'], drop_first = True, dtype = 'int64')
df = pd.get_dummies(df, columns = ['BetterRank'], drop_first = True, dtype = 'int64')

print('Number of variables after preprocessing: ', df.shape[1])
print('Number of null values after preprocessing: ', df.isnull().sum().sum())
df.head(15)

Number of variables after preprocessing:  102
Number of null values after preprocessing:  0


Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,TitleBout,NumberOfRounds,BlueCurrentLoseStreak,BlueCurrentWinStreak,...,Gender_MALE,RedStance_Orthodox,RedStance_Southpaw,RedStance_Switch,BlueStance_Orthodox,BlueStance_Southpaw,BlueStance_Switch,BlueStance_Switch.1,BetterRank_Red,BetterRank_neither
0,Eric Schafer,Jason Brilz,140.0,-160.0,140.0,62.5,False,3,1,0,...,1,1,0,0,1,0,0,0,0,1
1,Brandon Vera,Jon Jones,215.0,-235.0,215.0,42.5532,False,3,1,0,...,1,1,0,0,1,0,0,0,0,1
2,Junior Dos Santos,Gabriel Gonzaga,-250.0,230.0,40.0,230.0,False,3,0,1,...,1,1,0,0,1,0,0,0,0,1
3,Cheick Kongo,Paul Buentello,-345.0,315.0,28.9855,315.0,False,3,1,0,...,1,1,0,0,1,0,0,0,0,1
4,Alessio Sakara,James Irvin,-120.0,100.0,83.3333,100.0,False,3,1,0,...,1,1,0,0,1,0,0,0,0,1
5,Mike Pierce,Julio Paulino,-420.0,335.0,23.8095,335.0,False,3,0,0,...,1,1,0,0,1,0,0,0,0,1
6,Eliot Marshall,Vladimir Matyushenko,145.0,-165.0,145.0,60.6061,False,3,0,1,...,1,1,0,0,1,0,0,0,0,1
7,Brendan Schaub,Chase Gormley,-260.0,220.0,38.4615,220.0,False,3,1,0,...,1,1,0,0,1,0,0,0,0,1
8,Clay Guida,Shannon Gugerty,-420.0,335.0,23.8095,335.0,False,3,1,0,...,1,1,0,0,1,0,0,0,0,1
9,Duane Ludwig,Darren Elkins,-155.0,135.0,64.5161,135.0,False,3,0,0,...,1,1,0,0,1,0,0,0,0,1


In [19]:
# We will drop some variables which are not needed in the model

# 'RedOdds' and 'BlueOdds' give us the same information as 'RedExpectedValue' and 'BlueExpectedValue', so we will drop them
df.drop(['RedOdds', 'BlueOdds'], axis = 1, inplace = True)

# 'FinishRound' and 'TotalFightTimeSecs' are post-fight variables, so we will drop them as well
df.drop(['FinishRound', 'TotalFightTimeSecs'], axis = 1, inplace = True)

# 'RedFighter' and 'BlueFighter' are the names of the fighters, so they are not useful for the model
df.drop(['RedFighter', 'BlueFighter'], axis = 1, inplace = True)

# All the variables that are the difference between the two fighters will be dropped as well, as they are not useful for the model
diff_columns = [col for col in df.columns if 'Dif' in col]
df.drop(diff_columns, axis = 1, inplace = True)

print(df.shape[1], 'variables: ', list(df.columns))

81 variables:  ['RedExpectedValue', 'BlueExpectedValue', 'TitleBout', 'NumberOfRounds', 'BlueCurrentLoseStreak', 'BlueCurrentWinStreak', 'BlueDraws', 'BlueAvgSigStrLanded', 'BlueAvgSigStrPct', 'BlueAvgSubAtt', 'BlueAvgTDLanded', 'BlueAvgTDPct', 'BlueLongestWinStreak', 'BlueLosses', 'BlueTotalRoundsFought', 'BlueTotalTitleBouts', 'BlueWinsByDecisionMajority', 'BlueWinsByDecisionSplit', 'BlueWinsByDecisionUnanimous', 'BlueWinsByKO', 'BlueWinsBySubmission', 'BlueWinsByTKODoctorStoppage', 'BlueWins', 'BlueHeightCms', 'BlueReachCms', 'BlueWeightLbs', 'RedCurrentLoseStreak', 'RedCurrentWinStreak', 'RedDraws', 'RedAvgSigStrLanded', 'RedAvgSigStrPct', 'RedAvgSubAtt', 'RedAvgTDLanded', 'RedAvgTDPct', 'RedLongestWinStreak', 'RedLosses', 'RedTotalRoundsFought', 'RedTotalTitleBouts', 'RedWinsByDecisionMajority', 'RedWinsByDecisionSplit', 'RedWinsByDecisionUnanimous', 'RedWinsByKO', 'RedWinsBySubmission', 'RedWinsByTKODoctorStoppage', 'RedWins', 'RedHeightCms', 'RedReachCms', 'RedWeightLbs', 'RedAg

In [20]:
# We almost have the dataset ready to train the model, but we need to check all the variables are numerical
print('Number of numerical variables: ', len(df.select_dtypes(include = ['int64', 'float64', 'int32']).columns))
print('Number of variables:', len(df.columns))

# We then have 2 variables which are not numerical, let's see them
print('Non-numerical variables: ', df.select_dtypes(exclude = ['int64', 'float64', 'int32']).columns.tolist())

# 'TitleBout' is a boolean variable, so we will convert it to 0 and 1
df['TitleBout'] = df['TitleBout'].map({True: 1, False: 0})

# Let's check all the variables are numerical now
print('Number of numerical variables after preprocessing: ', len(df.select_dtypes(include = ['int64', 'float64', 'int32']).columns))

Number of numerical variables:  80
Number of variables: 81
Non-numerical variables:  ['TitleBout']
Number of numerical variables after preprocessing:  81


In [21]:
# Let's see the number of fights for each gender, as well as the number of fights which are title bouts
print('Number of male fights:', df['Gender_MALE'].sum())
print('Number of female fights:', list(df.shape)[0] - df['Gender_MALE'].sum())
print('Number of title bouts fights:', df['TitleBout'].sum())
print('Number of non-title bouts fights:', list(df.shape)[0] - df['TitleBout'].sum())

Number of male fights: 5727
Number of female fights: 801
Number of title bouts fights: 302
Number of non-title bouts fights: 6226


# **Key transformation for the target variable**

### 🧠 Understanding and Normalizing Betting Probabilities

The **expected values** $e_x$ where $x$ represents the color ($b$ for blue, $r$ for red) represent the profit one would earn from a \$100 bet. Therefore, the **decimal odds** $d_x$ can be computed as:

$
d_x = \dfrac{e_x}{100} + 1 = \dfrac{e_x + 100}{100}
$

The **implied probability** $q_x$ given by the betting house is the inverse of these odds:

$
q_x = \dfrac{1}{d_x} = \dfrac{100}{e_x + 100}
$

However, betting houses **intentionally inflate** the total implied probabilities to ensure profit (a practice known as the **overround**):

$
q_b + q_r > 1
$

To normalize these and ensure their probabilities $p_x$ sum to 1, we apply:

$
p_b = \dfrac{q_b}{q_b + q_r} = \dfrac{\dfrac{100}{e_b + 100}}{\dfrac{100}{e_b + 100} + \dfrac{100}{e_r + 100}} = \dfrac{\dfrac{100}{e_b + 100}}{\dfrac{100 \cdot (e_r + e_b + 200)}{(e_b + 100)(e_r + 100)}} = \dfrac{e_r + 100}{e_r + e_b + 200}
$

Then:

$
p_r = 1 - p_b
$

This normalization ensures fair, interpretable probabilities to compare against model predictions.

In [22]:
# Now, let's apply the function explained before to create a new column 'Prob_BlueWin' which will be the probability of the blue fighter to win the fight
df['Prob_BlueWin_Bet'] = (df['RedExpectedValue'] + 100) / (df['RedExpectedValue'] + df['BlueExpectedValue'] + 200)

# Save the following columns as they are needed to calculate the results of the model
df.iloc[last_train_index:, :][['RedExpectedValue', 'BlueExpectedValue']].to_csv('testing_odds.csv', index = False)

# Then, let's drop the 'RedExpectedValue' and 'BlueExpectedValue' columns as they are not needed anymore, as well as 'RedDecOdds' and 'BlueDecOdds' which are not needed for the model
df.drop(['RedExpectedValue', 'BlueExpectedValue', 'RedDecOdds', 'BlueDecOdds'], axis = 1, inplace = True)

In [23]:
print('Number of variables after preprocessing: ', df.shape[1], 'variables: ', list(df.columns))

df.head()

Number of variables after preprocessing:  78 variables:  ['TitleBout', 'NumberOfRounds', 'BlueCurrentLoseStreak', 'BlueCurrentWinStreak', 'BlueDraws', 'BlueAvgSigStrLanded', 'BlueAvgSigStrPct', 'BlueAvgSubAtt', 'BlueAvgTDLanded', 'BlueAvgTDPct', 'BlueLongestWinStreak', 'BlueLosses', 'BlueTotalRoundsFought', 'BlueTotalTitleBouts', 'BlueWinsByDecisionMajority', 'BlueWinsByDecisionSplit', 'BlueWinsByDecisionUnanimous', 'BlueWinsByKO', 'BlueWinsBySubmission', 'BlueWinsByTKODoctorStoppage', 'BlueWins', 'BlueHeightCms', 'BlueReachCms', 'BlueWeightLbs', 'RedCurrentLoseStreak', 'RedCurrentWinStreak', 'RedDraws', 'RedAvgSigStrLanded', 'RedAvgSigStrPct', 'RedAvgSubAtt', 'RedAvgTDLanded', 'RedAvgTDPct', 'RedLongestWinStreak', 'RedLosses', 'RedTotalRoundsFought', 'RedTotalTitleBouts', 'RedWinsByDecisionMajority', 'RedWinsByDecisionSplit', 'RedWinsByDecisionUnanimous', 'RedWinsByKO', 'RedWinsBySubmission', 'RedWinsByTKODoctorStoppage', 'RedWins', 'RedHeightCms', 'RedReachCms', 'RedWeightLbs', 'RedA

Unnamed: 0,TitleBout,NumberOfRounds,BlueCurrentLoseStreak,BlueCurrentWinStreak,BlueDraws,BlueAvgSigStrLanded,BlueAvgSigStrPct,BlueAvgSubAtt,BlueAvgTDLanded,BlueAvgTDPct,...,RedStance_Orthodox,RedStance_Southpaw,RedStance_Switch,BlueStance_Orthodox,BlueStance_Southpaw,BlueStance_Switch,BlueStance_Switch.1,BetterRank_Red,BetterRank_neither,Prob_BlueWin_Bet
0,0,3,1,0,0,31.6667,0.46,0.6667,1.6667,0.5,...,1,0,0,1,0,0,0,0,1,0.596273
1,0,3,1,0,0,39.0,0.478,0.25,2.5,0.525,...,1,0,0,1,0,0,0,0,1,0.688445
2,0,3,0,1,0,13.7,0.548,0.7,1.2,0.572,...,1,0,0,1,0,0,0,0,1,0.297872
3,0,3,1,0,0,25.8,0.466,0.4,0.2,0.1,...,1,0,0,1,0,0,0,0,1,0.237112
4,0,3,1,0,0,5.25,0.555,0.125,0.0,0.0,...,1,0,0,1,0,0,0,0,1,0.478261


# **Exporting the dataset**

In [24]:
print(last_train_index)

df[:last_train_index + 1].to_csv('training_data.csv', index = False)
df[last_train_index + 1:].to_csv('testing_data.csv', index = False)

6027
