In [100]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [101]:
df=pd.read_excel('PassEventsForwardFootball.xlsx')

In [102]:
# --------------------------------------------------------- Observe
# there will be a lot of columns that are not informative( just have one unique value), then check how many here
def get_columns_with_one_unique_value(df):
    col_counts = df.nunique()
    cols_with_one_unique_value = col_counts[col_counts == 1]
    return list(cols_with_one_unique_value.index)
get_columns_with_one_unique_value(df)

# *********************************************************

# --------------------------------------------------------- Observe
def print_unique_value(df):
    for col in df:
        print("column name:",col)
        print(df[col].unique())
        print("---")
# print_unique_value(df)

# from the result, we could see that columns
# Type, x_pitchsize, y_pitchsize each have just one unique value->done
# Club has just one unique vale ['Team Forward Football']-> done


# isForward and isSucceeded are either True or False, then convert them to numerical value 0 and 1 in order to better process-> done
# Team is either ['Team Forward Football_1' 'Team Forward Football_2'], then convert them to numerical value 0 and 1 -> done
# Pass type ['Forward pass' 'Lateral pass' 'Backward pass']-> 0,1,2->done
# Pressure level -> ['Full Pressure' 'No Pressure' 'Limited Pressure']->done
# column name: Zone ['Attack' 'Defence' 'Mid field']->0,1,2
# column name: Playing direction_first half ['left' 'right']->done
# column name: Playing direction_second half ['left' 'right']->done

# matchDuration is written in minutes and also just have two values-> don't need to convert

# *********************************************************


In [103]:
# --------------------------------------------------------- Process features

# convert data type of caregorical columns to int for easier process in the future
df.isForward = df.isForward.replace({True: 1, False: 0})
df.isSucceeded = df.isSucceeded.replace({True: 1, False: 0})
df.Team=df.Team.replace({'Team Forward Football_1':0,'Team Forward Football_2':1}) # first team 1-> number 1,but seems that it's better to have classified value begin from 0
df['Pass type']=df['Pass type'].replace({"Forward pass":0,"Lateral pass":1,"Backward pass":2})
df['Pressure level']=df['Pressure level'].replace({"Full Pressure":2,"Limited Pressure":1,"No Pressure":0})
df['Zone']=df['Zone'].replace({'Attack':0,'Defence':1,'Mid field':2})
df['Playing direction_first half']=df['Playing direction_first half'].replace({'left':0,'right':1})
df['Playing direction_second half']=df['Playing direction_second half'].replace({'left':0,'right':1})

# *********************************************************

In [104]:
# --------------------------------------------------------- Add features

df['pass_x']=df["posX_passer"]-df["received_PosX"]
df['pass_y']=df["posY_passer"]-df["received_PosY"]

# *********************************************************

In [105]:
# --------------------------------------------------------- Observe

# evidence for using history of player
set1 = set(df[df.Team==0]['Player_id'])
set2= set(df[df.Team==1]['Player_id'])

print("set1: ",set1)
print("set2: ",set2)
overlap = set1.intersection(set2)
print("intersection:",overlap)

# *********************************************************

set1:  {95579, 95580, 95581, 95582, 95583, 95584, 95585, 95586, 95587, 95588, 95589, 95591, 95592, 95593, 95594, 95595, 95597, 95600, 95601, 95986, 95987, 95603, 95988, 95602}
set2:  {95617, 95618, 95624, 95581, 95582, 95583, 95584, 95585, 95586, 95587, 95588, 95589, 95591, 95592, 95593, 95594, 95597, 95600, 95601, 95602, 95987, 95988, 95986}
intersection: {95581, 95582, 95583, 95584, 95585, 95586, 95587, 95588, 95589, 95591, 95592, 95593, 95594, 95597, 95600, 95601, 95602, 95987, 95988, 95986}


In [106]:
# --------------------------------------------------------- Observe

# conclusion: from the sorted TimeStamp, I found that there are two matches rather than two teams in one match
# -> more obvious, day is different
# ->done
# Also, information is not so much to be periodic-> split method should not use timeseriessplit

df_team_one=df[df.Team==0]
df_team_two=df[df.Team==1]

# df_team_one.sort_values('TimeStamp') # check the starting and ending time of each match



In [107]:
# df_team_two.sort_values('TimeStamp')

# *********************************************************

In [108]:
# --------------------------------------------------------- Observe

# df.groupby(['Player_id']).size() 

# calculate this in order to make sure there is no player with just a single line data
# -> avoid when splitting dataset, there is no solution to split into training and test dataset
# -> the result is positive-> done

# *********************************************************

In [109]:
# --------------------------------------------------------- Observe

def print_column_name(df):
    for col in df:
        print("column name:",col)
# print_column_name(df)


# feature expansion
# 1. this is obvious that the distance between passer and receicver has the influence on if the pass is successful
# -> found that Pass length has been calculated->done
# -> but angle of start is also informative-> angle is also calculated->done
# 2. the difference between startTime and the start time have an influence on the physical strength-> done-> not informative, not peroidic->discard
# 3. if the exact position or the interval of position has an influence on the result-> generate features-> how to use it
#  * feature selection ( based on model, or correlation )
#  * binned feature and then feature selection
#  * alternative column: zone -> selected
# 4. note that former part is more team 1, later part is for team 2, then there is also the overlap timestamp of records, try to use time to generate features-> not correct
# time can also use Time block, or calculated time using minus -> select Time Block
# 5. time related features
# -> how many opponents appear in 5s in a exact scope-> don't have enough data
# -> how many friends appear in 5s or around 5s in the same field-> don't have enough data
# -> how many rival passes-> don't have enough data
# -> how many friends passes-> done
# -> how many rivals in a certain fields.-> don't have enough data
# -> how many friends in a certain fields.-> don't have enough data

# 5. Angle passe is for ridian, maybe degree is better -> not sure,but don't think so, just leave this idea here-> select ridian

# consider the success rate of pass of one player-> should done after split->done

# outlier?-> not suitable in this project

# *********************************************************

In [110]:
# --------------------------------------------------------- Add features
# How many friends passes in x second

df.set_index('TimeStamp', drop=True, inplace=True)
df = df.sort_index()
window=df['Zone'].rolling('10s')

def count_same_zone(x, current_player):
    return x[x == current_player].count()

df['player_num_in_same_zone'] = window.apply(lambda x: count_same_zone(x, x[0]), raw=False)
df=df.reset_index()
# *********************************************************

In [111]:
# --------------------------------------------------------- Observe: check missing values
def get_none_percent(df):
    return df.isna().sum()/df.shape[0]
def get_none_num(df):
    return df.isna().sum()

get_none_percent(df)
# *********************************************************

TimeStamp                        0.000000
Type                             0.000000
posX_passer                      0.000000
posY_passer                      0.000000
received_PosX                    0.000000
received_PosY                    0.000000
isForward                        0.000000
isSucceeded                      0.000000
receiverId                       0.285464
Player_id                        0.000000
Team                             0.000000
startTime                        0.000000
matchDuration                    0.000000
Club                             0.000000
Time block                       0.000000
Zone                             0.000000
Area Football Pitch              0.000000
Angle Passe                      0.000000
Pass type                        0.000000
Pass length                      0.005254
Playing direction_first half     0.000000
Playing direction_second half    0.000000
x_pitchsize                      0.000000
y_pitchsize                      0

In [112]:
# --------------------------------------------------------- Fill missing value of Pass length
# even the pass is not successful, then there is a expected sending point and a received point, so all the Pass length could be calculated
# -> the missing value of Pass length is solved-> done

def compute_lenght(df):
    import math
    length=np.sqrt((df['posX_passer']-df['received_PosX'])**2+(df['posY_passer']-df['received_PosY'])**2)
    return length
fill_series=compute_lenght(df)
df["Pass length"]=df["Pass length"].fillna(fill_series)
# *********************************************************

In [113]:
# --------------------------------------------------------- Observe
# found that there is no receivedId if isSucceed is false, check if they have the one-to-one relation
# -> the answer is yes, so receivedId is also the answer!!! we can't use it in training a model
# -> just delete it (receivedId)->done

1-df['isSucceeded'].sum()/df.shape[0]

# *********************************************************

0.2854640980735552

In [114]:
# --------------------------------------------------------- Observe
# considering about the imbalance of isSucceeded

# print("successful:",df['isSucceeded'].sum()/df.shape[0])
# print("failure:",1-df['isSucceeded'].sum()/df.shape[0])

# -> not so balanced
# -> so try to interpolate the minor class or reduce the major class or using another score metric *** different choice
# -> which is suitable? but imbalanced data is not series, so tring score metric, for example, f1 score->selected
# *********************************************************


In [115]:
# --------------------------------------------------------- Observe
# Define a custom function that returns the data type of a column

# def check_dtype(col):
#     return col.dtype
    
# Apply the custom function to each column of the DataFrame
# df.apply(check_dtype)

# *********************************************************

In [156]:
# --------------------------------------------------------- Add features
# generate bin values
num_bins = 4
df['angle_bins'] = pd.qcut(df['Angle Passe'], num_bins)

lst=df['angle_bins'].unique()
my_dic={}
my_dict = {}
for i, val in enumerate(lst):
    my_dict[val] = i

df.angle_bins = df.angle_bins.replace(my_dict).astype("int64")
# *********************************************************

In [158]:
# --------------------------------------------------------- Observe
import seaborn as sns
def get_distribution_of_each_column(df):
    # Loop over the features
    for col in df:
        # Select the feature
        feature = df[col]
        
        # Plot the distribution of the feature
        sns.histplot(feature)
        plt.show()

def get_distribution_columns(df,col_lst):
    # Loop over the features
    for col in col_lst:
        # Select the feature
        feature = df[col]
        
        # Plot the distribution of the feature
        sns.histplot(feature)
        plt.show()
# get_distribution_of_each_column(df) 
# found that Pass length and Distance to first oppoment have log distribution, also pair_count
# pair_count is necesseay for transformation???-> selected not
# *********************************************************

# --------------------------------------------------------- Turn distribution
# actually, this part should be put after all the features are generated, but I had done so, the result is the same
# -> considering the interface, it's convenient to put it here
from sklearn.preprocessing import QuantileTransformer
transformer = QuantileTransformer(output_distribution='normal')
df['Pass length'] = transformer.fit_transform(df['Pass length'].values.reshape(-1,1))
df['Distance to first opponent'] = transformer.fit_transform(df['Distance to first opponent'].values.reshape(-1,1))
# df['pair_count'] = transformer.fit_transform(df['pair_count'].values.reshape(-1,1))


# for validation
# get_distribution_columns(df,['Pass length','Distance to first opponent','pair_count']) # found that normal distribution has been transformed successfully
# *********************************************************

In [159]:
# --------------------------------------------------------- Data split
from sklearn.model_selection import train_test_split
df["receiverId"]=df["receiverId"].fillna(0) # a sign bit, in order to distinguish
X = df.loc[:, ~df.columns.isin(["isSucceeded"])]
y = df["isSucceeded"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=df[['Player_id','isSucceeded']]) # former proven history variety, later proven target variety
df_train=pd.concat([X_train, y_train], axis=1)
df_test= pd.concat([X_test, y_test], axis=1)
# *********************************************************

In [160]:
# --------------------------------------------------------- Add features
# Note: the code segment is duplicate-> for future improvement !!!
# count pair (receiverId and playId)
def added_X_train_test_pair(df_train,X_train,X_test):
    data_dict=df_train.groupby(['receiverId','Player_id']).sum('isSucceeded')['isSucceeded'].to_dict()

    def use_lambda(df):
        import math
        if (df["receiverId"],df['Player_id']) in data_dict.keys() and df["receiverId"]!=0:
            return data_dict[(df["receiverId"],df['Player_id'])]
        return 0

    def add_pair_rate_train(df): 
        df['pair_count']=df.apply(use_lambda,axis=1)
        return df

    def add_pair_rate_test(df):
        df['pair_count']=df.apply(use_lambda,axis=1)
        return df

    return add_pair_rate_train(X_train),add_pair_rate_test(X_test)
    
X_train,X_test=added_X_train_test_pair(df_train,X_train,X_test)
# X_test.pair_count.sum() # result is more than 100, so a good idea at least

def add_count_player_id(X_train,X_test):
    pass_dict=df_train.groupby(['Player_id']).count()['posX_passer'].to_dict()
    def add_success_count_train(df): # note player_id and success_rate is one vs one, have no information, but will have information on the test data!!!
        df['succeed_count']=df['Player_id'].map(lambda x:pass_dict[x])
        return df
    def use_lambda(x):
        import math
        if x in pass_dict.keys():
            return pass_dict[x]
        return np.mean(list(pass_dict.values()))
    def add_success_rate_test(df):
        df['succeed_count']=df['Player_id'].map(lambda x:use_lambda(x))
        return df
    return add_success_count_train(X_train),add_success_rate_test(X_test)
    
X_train,X_test=add_count_player_id(X_train,X_test)

def add_success_rate(df_train,X_train,X_test):
    data_dict=df_train.groupby(['Player_id']).mean('isSucceeded')['isSucceeded'].to_dict()

    def add_success_rate_train(df): # note player_id and success_rate is one vs one, have no information, but will have information on the test data!!!
        df['succeed_rate']=df['Player_id'].map(lambda x:data_dict[x])
        return df
    
    def use_lambda(x):
        import math
        if x in data_dict.keys():
            return data_dict[x]
        return np.mean(list(data_dict.values()))
    def add_success_rate_test(df):
        df['succeed_rate']=df['Player_id'].map(lambda x:use_lambda(x))
        return df
    return add_success_rate_train(X_train),add_success_rate_test(X_test)
X_train,X_test=add_success_rate(df_train,X_train,X_test)

def add_zone_rate(df_train,X_train,X_test):
    # generate feature based on received_id and also zone
    zone_dict=df_train.groupby(['Player_id','Zone']).mean('isSucceeded')['isSucceeded'].to_dict()
    # df['zone_rate']=df[['Player_id','Zone']].map(lambda x:zone_dict[(x[0],x[1])]) this can't work, map can only used in series, use apply instead
    def use_zone_lambda_train(df):
        return zone_dict[(df["Player_id"],df['Zone'])]
    def add_zone_rate_train(df): # note player_id and success_rate is one vs one, have no information, but will have information on the test data!!!
        df['zone_rate']=df[['Player_id','Zone']].apply(use_zone_lambda_train,axis=1)
        return df

    def use_zone_lambda_test(df):
        import math
        if (df["Player_id"],df['Zone']) in zone_dict.keys():
            return zone_dict[(df["Player_id"],df['Zone'])]
        else:
            sub_dict = {key: value for key, value in zone_dict.items() if key[1]== df['Zone']}
            return np.mean(list(sub_dict.values()))
    def add_zone_rate_test(df):
        df['zone_rate']=df[['Player_id','Zone']].apply(use_zone_lambda_test,axis=1)
        return df
    return add_zone_rate_train(X_train),add_zone_rate_test(X_test)
X_train,X_test=add_zone_rate(df_train,X_train,X_test)

def add_pass_type_rate(df_train,X_train,X_test):
    pass_type_dict=df_train.groupby(['Player_id','Pass type']).mean('isSucceeded')['isSucceeded'].to_dict()

    def use_pass_type_lambda_train(df):
        return pass_type_dict[(df["Player_id"],df['Pass type'])]
    
    def add_pass_type_rate_train(df): 
        df['pass_type_rate']=df[['Player_id','Pass type']].apply(use_pass_type_lambda_train,axis=1)
        return df

    def use_pass_type_lambda_test(df):
        import math
        if (df["Player_id"],df['Pass type']) in pass_type_dict.keys():
            return pass_type_dict[(df["Player_id"],df['Pass type'])]
        else:
            sub_dict = {key: value for key, value in pass_type_dict.items() if key[1]== df['Pass type']}
            return np.mean(list(sub_dict.values()))

    def add_pass_type_rate_test(df):
        df['pass_type_rate']=df[['Player_id','Pass type']].apply(use_pass_type_lambda_test,axis=1)
        return df
    return add_pass_type_rate_train(X_train),add_pass_type_rate_test(X_test)

X_train,X_test=add_pass_type_rate(df_train,X_train,X_test)

def add_pressure_level_rate(df_train,X_train,X_test):
    pressure_level_dict=df_train.groupby(['Player_id','Pressure level']).mean('isSucceeded')['isSucceeded'].to_dict()

    def use_pressure_level_lambda_train(df):
        return pressure_level_dict[(df["Player_id"],df['Pressure level'])]
    
    def add_pressure_level_rate_train(df): 
        df['pressure_level_rate']=df[['Player_id','Pressure level']].apply(use_pressure_level_lambda_train,axis=1)
        return df

    def use_pressure_level_lambda_test(df):
        import math
        if (df["Player_id"],df['Pressure level']) in pressure_level_dict.keys():
            return pressure_level_dict[(df["Player_id"],df['Pressure level'])]
        else:
            sub_dict = {key: value for key, value in pressure_level_dict.items() if key[1]== df['Pressure level']}
            return np.mean(list(sub_dict.values()))

    def add_pressure_level_rate_test(df):
        df['pressure_level_rate']=df[['Player_id','Pressure level']].apply(use_pressure_level_lambda_test,axis=1)
        return df
    return add_pressure_level_rate_train(X_train),add_pressure_level_rate_test(X_test)

X_train,X_test=add_pressure_level_rate(df_train,X_train,X_test)

# *********************************************************

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pair_count']=df.apply(use_lambda,axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pair_count']=df.apply(use_lambda,axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['succeed_count']=df['Player_id'].map(lambda x:pass_dict[x])
A value is trying to be set on a copy of a slice fro

In [161]:
import copy
X_train_b=copy.deepcopy(X_train)
X_test_b=copy.deepcopy(X_test)


In [162]:
X_train=copy.deepcopy(X_train_b)
X_test=copy.deepcopy(X_test_b)

In [163]:
# --------------------------------------------------------- Drop uninformative columns
def drop_columns(df,lst_columns):
    return df.drop(columns=lst_columns,axis=1)
def drop_columns_with_one_unique_value(df):
    return drop_columns(df,get_columns_with_one_unique_value(df))

def get_columns_with_one_unique_value(df):
    col_counts = df.nunique()
    cols_with_one_unique_value = col_counts[col_counts == 1]
    return list(cols_with_one_unique_value.index)

lst_delete=['TimeStamp','startTime','Team','receiverId'] 
lst_one_value=get_columns_with_one_unique_value(df)
X_train=drop_columns(X_train,lst_delete+lst_one_value)
X_test=drop_columns(X_test,lst_delete+lst_one_value)
# print(lst_one_value)

# *********************************************************

In [164]:
# --------------------------------------------------------- Show added features are informative
def plot_corr(X_train,y_train):
    import seaborn as sns
    import copy
    df_plot=copy.deepcopy(X_train)
    df_plot['target'] = copy.deepcopy(y_train)
    corr = df_plot.corr()
    sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, cmap="RdBu")
# plot_corr(X_train,y_train)

# *********************************************************

In [165]:
# --------------------------------------------------------- model training
from sklearn import metrics
def get_dummy_score(X_train, X_test, y_train, y_test):
    from sklearn.dummy import DummyClassifier


    # Create an instance of the DummyClassifier class
    dummy_classifier = DummyClassifier(strategy='most_frequent')

    # Fit the DummyClassifier instance to the training data
    dummy_classifier.fit(X_train, y_train)

    y_pred=dummy_classifier.predict(X_test)
    f1 = metrics.f1_score(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    print("F1 Score:", f1)
    print("Accuracy:", acc)
    print("Precision:", prec)
get_dummy_score(X_train, X_test, y_train, y_test)

# *********************************************************

F1 Score: 0.8373983739837398
Accuracy: 0.7202797202797203
Precision: 0.7202797202797203


In [170]:
# Import the necessary libraries and models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# insight: classifiers are just above the dummy model or even above, try to extract more features

# Define a dictionary of classification models
models = {
    "logistic_regression": LogisticRegression(),
    "support_vector_machine": SVC(),
    "k_nearest_neighbors": KNeighborsClassifier(),
    "decision_tree": DecisionTreeClassifier(),
    "random_forest": RandomForestClassifier(),
    "ada_boost": AdaBoostClassifier(),
    "gradient_boosting": GradientBoostingClassifier(),
    "xg_boost": XGBClassifier(),
    "bagging": BaggingClassifier(),
    "extra_trees": ExtraTreesClassifier(),
    "mlp": MLPClassifier(),
    "gaussian_process": GaussianProcessClassifier(),
    "quadratic_discriminant_analysis": QuadraticDiscriminantAnalysis()
}


In [171]:
from sklearn import metrics
def evaluate_models(model,name, X_train, X_test, y_train, y_test ):
  # Loop through each model
  # Fit the model and make predictions
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  # Calculate and print the metrics
  f1 = metrics.f1_score(y_test, y_pred)
  acc = metrics.accuracy_score(y_test, y_pred)
  prec = metrics.precision_score(y_test, y_pred)
  print(name)
  print("F1 Score:", f1)
  print("Accuracy:", acc)
  print("Precision:", prec)

In [173]:
# Loop through the models and evaluate each one
for name, model in models.items():
    evaluate_models(model,name,X_train, X_test, y_train, y_test)

# note: majority of models, the score is bad than dummy model, which means there are so many noise.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


logistic_regression
F1 Score: 0.850356294536817
Accuracy: 0.7797202797202797
Precision: 0.8325581395348837
support_vector_machine
F1 Score: 0.8373983739837398
Accuracy: 0.7202797202797203
Precision: 0.7202797202797203
k_nearest_neighbors
F1 Score: 0.8449438202247191
Accuracy: 0.7587412587412588
Precision: 0.7866108786610879
decision_tree
F1 Score: 0.9242819843342037
Accuracy: 0.8986013986013986
Precision: 1.0
random_forest
F1 Score: 0.9242819843342037
Accuracy: 0.8986013986013986
Precision: 1.0
ada_boost
F1 Score: 0.9242819843342037
Accuracy: 0.8986013986013986
Precision: 1.0
gradient_boosting
F1 Score: 0.9242819843342037
Accuracy: 0.8986013986013986
Precision: 1.0
xg_boost
F1 Score: 0.9242819843342037
Accuracy: 0.8986013986013986
Precision: 1.0
bagging
F1 Score: 0.9242819843342037
Accuracy: 0.8986013986013986
Precision: 1.0
extra_trees
F1 Score: 0.9309462915601023
Accuracy: 0.9055944055944056
Precision: 0.9837837837837838


  _warn_prf(average, modifier, msg_start, len(result))


mlp
F1 Score: 0.0
Accuracy: 0.27972027972027974
Precision: 0.0
gaussian_process
F1 Score: 0.7780548628428928
Accuracy: 0.6888111888111889
Precision: 0.8
quadratic_discriminant_analysis
F1 Score: 0.921875
Accuracy: 0.8951048951048951
Precision: 0.9943820224719101




In [181]:
# Define the parameter grid
param_grid = {
    'n_estimators': [600,610,620,590],
    'max_depth': [40,50],
}

from sklearn.model_selection import RandomizedSearchCV

# Create an instance of the Random Forest classifier
classifier = ExtraTreesClassifier()

# Create an instance of the RandomizedSearchCV class
random_search = RandomizedSearchCV(classifier, param_grid,cv=5,scoring='f1') 

# Fit the RandomizedSearchCV to the data
random_search.fit(X_train, y_train)

print(random_search.best_params_)

classifier = random_search.best_estimator_
y_pred=classifier.predict(X_test)
# Calculate and print the metrics
f1 = metrics.f1_score(y_test, y_pred)
acc = metrics.accuracy_score(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred)
print("F1 Score:", f1)
print("Accuracy:", acc)
print("Precision:", prec)

# !!! conclusion: this kind of method is not correct, cv=ts_split, the best estimator would be the one tuning the parameter based on validation dataset,leading to overfitting, not generalization.
# ->wrong
# so note: maybe the model is too complex ?->discard this model->done
# ->wrong
# even through cv has test dataset, but still can overfit, becase overfit is the situation when strong model and less test score, less test data



{'n_estimators': 600, 'max_depth': 40}
F1 Score: 0.9151670951156812
Accuracy: 0.8846153846153846
Precision: 0.9726775956284153
