# Eliminate Highly Correlated Values


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the fight table from the notebook's upload location.
df = pd.read_csv('/content/result_df_f (1).csv')

print(f"Total columns in DataFrame: {df.shape[1]}")

# Substrings that mark a column as a per-round statistic.
round_identifiers = ['Round 1', 'Round 2', 'Round 3', 'Round 4', 'Round 5']

# A column is "round-wise" when its name contains at least one identifier.
round_cols = [
    name
    for name in df.columns
    if any(tag in name for tag in round_identifiers)
]

# Work only with the remaining (non round-wise) columns from here on.
df = df.drop(columns=round_cols)

print(f"Removed {len(round_cols)} round-wise columns.")


# 1. Create target variable: 1 when the row's fighter name equals the
#    recorded winner name (first AND last name must match), else 0.
first_name_matches = df['Fighter First Name'] == df['Winner First Name']
last_name_matches = df['Fighter Last Name'] == df['Winner Last Name']
df['Is_Winner'] = (first_name_matches & last_name_matches).astype(int)
features = df.columns


# 2. Select ALL numeric features, leaving out the target itself and the
#    'Fight ID' column, which are not candidate predictors.
non_feature_columns = {'Is_Winner', 'Fight ID'}
all_numeric_features = [
    name
    for name in df.select_dtypes(include=np.number).columns
    if name not in non_feature_columns
]

print(f"Original numeric features: {len(all_numeric_features)}")

# 3. Smarter correlation filtering (keep more features)
def remove_highly_correlated(df, features, threshold=0.85):
    """Drop features so that no surviving pair is correlated above ``threshold``.

    The absolute Pearson correlation matrix of ``df[features]`` is scanned
    column by column (upper triangle only, so each pair is visited once).
    Whenever the current column is correlated above ``threshold`` with one or
    more earlier, still-surviving features, the whole group (those features
    plus the current column) is reduced to the single member with the
    highest variance.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named in ``features``.
    features : sequence of str
        Names of the numeric columns to filter.
    threshold : float, default 0.85
        Absolute correlation strictly above which two features are treated
        as redundant.

    Returns
    -------
    list
        The entries of ``features`` that survive, in their original order.
    """
    features = list(features)
    if len(features) < 2:
        # Nothing to compare against.
        return features

    corr_matrix = df[features].corr().abs()
    # Keep only the strict upper triangle: for column j this leaves the
    # correlations with columns that come before j.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    variances = df[features].var()

    to_drop = set()
    for col in upper.columns:
        if col in to_drop:
            continue

        # Earlier features correlated with `col` that have not been dropped yet.
        partners = [
            other
            for other in upper.index[upper[col] > threshold]
            if other not in to_drop
        ]
        if not partners:
            continue

        # The current column must take part in the comparison; otherwise a
        # plain correlated pair would keep both of its members.
        group = partners + [col]
        keeper = variances[group].idxmax()
        to_drop.update(name for name in group if name != keeper)

    return [f for f in features if f not in to_drop]

# Run the correlation filter over every candidate numeric feature (default
# threshold) and report how many, and which, features remain.
filtered_features = remove_highly_correlated(
    df,
    features=all_numeric_features,
)
print(f"After correlation filter: {len(filtered_features)}")
print(filtered_features)




  df = pd.read_csv('/content/result_df_f (1).csv')


Total columns in DataFrame: 515
Removed 90 round-wise columns.
Original numeric features: 411
After correlation filter: 241
['Winning Round', 'Height Feet', 'Height Inches', 'Weight Pounds', 'Reach Inches', 'Knockdown Total', 'Significant Strike Total Attempted', 'Takedown Total Attempted', 'Takedown Total Landed', 'Submission Attempted', 'Reversal', 'Significant Strike Head Landed', 'Significant Strike Body Attempted', 'Significant Strike Body Landed', 'Significant Strike Leg Attempted', 'Significant Strike Leg Landed', 'Significant Strike Clinch Attempted', 'Significant Strike Clinch Landed', 'Significant Strike Ground Attempted', 'Significant Strike Ground Landed', 'odds', 'Winning Time_Seconds', 'Ground and Cage Control Time_Seconds', 'Avg_Significant Strike Total Attempted_Wins', 'Significant Strike Total Attempted_Losses', 'Avg_Significant Strike Total Attempted_Losses', 'Significant Strike Total Attempted_Wins_KO_TKO', 'Avg_Significant Strike Total Attempted_Wins_KO_TKO', 'Signi

# Forward Wrapper

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler

# 1. Standardise the correlation-filtered features (zero mean, unit variance).
#    NOTE(review): the scaler is fitted on all rows, before any CV split.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[filtered_features])

# 2. Lightweight random forest so the many refits done by the selector stay
#    affordable. The seed makes the selected feature set reproducible across
#    runs (same seed as the final model cell).
model = RandomForestClassifier(
    n_estimators=30,   # few trees
    max_depth=5,       # shallow trees
    n_jobs=-1,         # use all CPU cores
    random_state=42,
)

# 3. Greedy forward selection: starts from no features and adds one at a
#    time, scored by 3-fold CV accuracy, until exactly 15 are selected
#    (fixed target size; no early stopping is configured).
forward_selector = SequentialFeatureSelector(
    model,
    n_features_to_select=15,
    direction='forward',
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

forward_selector.fit(X_scaled, df['Is_Winner'])

# Map the boolean support mask back to column names.
forward_selected_features = np.array(filtered_features)[forward_selector.get_support()]
print("Forward-selected features:", forward_selected_features)

Forward-selected features: ['Reach Inches' 'Takedown Total Landed'
 'Significant Strike Body Attempted' 'Significant Strike Clinch Attempted'
 'odds' 'Avg_Takedown Total Attempted_Losses_KO_TKO'
 'Takedown Total Landed_Losses' 'Reversal_Wins_SUB'
 'Avg_Ground and Cage Control Time_Seconds_Losses_U_DEC'
 'Avg_Significant Strike Leg Landed_Wins_KO_TKO'
 'Significant Strike Clinch Attempted_Wins_KO_TKO'
 'Avg_Significant Strike Clinch Landed_Losses'
 'Avg_Significant Strike Ground Landed_Losses'
 'Avg_Winning Time_Seconds_Wins' 'Winning Time_Seconds_CNC']


# Backward Wrapper

In [None]:
from sklearn.feature_selection import RFE  # Faster implementation
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# 1. Standardise the correlation-filtered features (zero mean, unit variance).
#    NOTE(review): the scaler is fitted on all rows, before any split.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[filtered_features])

# 2. Recursive Feature Elimination driven by a lightweight random forest.
#    The seed makes the importance ranking, and therefore the eliminated
#    features, reproducible across runs.
model = RandomForestClassifier(
    n_estimators=30,
    max_depth=5,
    n_jobs=-1,
    random_state=42,
)

backward_selector = RFE(
    estimator=model,
    n_features_to_select=15,  # stop once 15 features remain
    step=10,                  # remove up to 10 lowest-ranked features per refit
    verbose=1,                # log each refit
)

# 3. Fit on the pre-scaled data.
backward_selector.fit(X_scaled, df['Is_Winner'])

# 4. Map the boolean support mask back to column names.
backward_selected_features = np.array(filtered_features)[backward_selector.support_]
print("Backward-selected features:", backward_selected_features)


Fitting estimator with 241 features.
Fitting estimator with 231 features.
Fitting estimator with 221 features.
Fitting estimator with 211 features.
Fitting estimator with 201 features.
Fitting estimator with 191 features.
Fitting estimator with 181 features.
Fitting estimator with 171 features.
Fitting estimator with 161 features.
Fitting estimator with 151 features.
Fitting estimator with 141 features.
Fitting estimator with 131 features.
Fitting estimator with 121 features.
Fitting estimator with 111 features.
Fitting estimator with 101 features.
Fitting estimator with 91 features.
Fitting estimator with 81 features.
Fitting estimator with 71 features.
Fitting estimator with 61 features.
Fitting estimator with 51 features.
Fitting estimator with 41 features.
Fitting estimator with 31 features.
Fitting estimator with 21 features.
Backward-selected features: ['Reach Inches' 'Takedown Total Attempted' 'Takedown Total Landed'
 'Significant Strike Head Landed' 'Significant Strike Body Lan

# Trial and Error

In [None]:
from itertools import combinations
from sklearn.model_selection import cross_val_score

# Candidate feature subsets: two hand-picked pairs plus the outputs of the
# forward and backward wrapper cells.
candidate_subsets = [
    ['Takedown Total Landed', 'Significant Strike Total Landed'],
    ['Ground and Cage Control Time_Seconds', 'Reach Inches'],
    forward_selected_features.tolist(),
    backward_selected_features.tolist()
]

# Define the estimator here instead of reusing whatever `model` was left
# behind by the last-run selector cell. Same hyper-parameters as the wrapper
# cells, seeded so that the subset comparison is reproducible.
model = RandomForestClassifier(
    n_estimators=30,
    max_depth=5,
    n_jobs=-1,
    random_state=42,
)

# Score every subset with 5-fold CV accuracy and remember the best one.
# NOTE(review): the wrapper-selected subsets were chosen using these same
# rows, so their CV scores are likely optimistic compared to the hand-picked
# pairs.
best_score = 0
best_subset = None
for subset in candidate_subsets:
    score = cross_val_score(
        model,
        df[subset],
        df['Is_Winner'],
        cv=5,
        scoring='accuracy'
    ).mean()
    print(f"Subset: {subset} | Accuracy: {score:.4f}")
    if score > best_score:
        best_score = score
        best_subset = subset

print("\nBest subset:", best_subset, "| Accuracy:", best_score)

Subset: ['Takedown Total Landed', 'Significant Strike Total Landed'] | Accuracy: 0.5325
Subset: ['Ground and Cage Control Time_Seconds', 'Reach Inches'] | Accuracy: 0.5440
Subset: ['Reach Inches', 'Takedown Total Landed', 'Significant Strike Body Attempted', 'Significant Strike Clinch Attempted', 'odds', 'Avg_Takedown Total Attempted_Losses_KO_TKO', 'Takedown Total Landed_Losses', 'Reversal_Wins_SUB', 'Avg_Ground and Cage Control Time_Seconds_Losses_U_DEC', 'Avg_Significant Strike Leg Landed_Wins_KO_TKO', 'Significant Strike Clinch Attempted_Wins_KO_TKO', 'Avg_Significant Strike Clinch Landed_Losses', 'Avg_Significant Strike Ground Landed_Losses', 'Avg_Winning Time_Seconds_Wins', 'Winning Time_Seconds_CNC'] | Accuracy: 0.6536
Subset: ['Reach Inches', 'Takedown Total Attempted', 'Takedown Total Landed', 'Significant Strike Head Landed', 'Significant Strike Body Landed', 'Significant Strike Ground Attempted', 'Significant Strike Ground Landed', 'odds', 'Ground and Cage Control Time_Secon

# Model Training

In [None]:
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows (fixed seed) to evaluate the winning subset.
# NOTE(review): `best_subset` was chosen using all rows of `df`, including
# the rows that end up in this test split - confirm that this is acceptable.
X_best = df[best_subset]
y_all = df['Is_Winner']
X_train, X_test, y_train, y_test = train_test_split(
    X_best,
    y_all,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training part and report the accuracy
# on the held-out part.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
test_acc = model.score(X_test, y_test)
print(f"\nFinal Model Test Accuracy: {test_acc:.4f}")


# Pair each feature name with the fitted forest's importance value and list
# them from most to least important.
importance_table = pd.DataFrame(
    {
        'Feature': best_subset,
        'Importance': model.feature_importances_,
    }
)
importance = importance_table.sort_values('Importance', ascending=False)
print("\nFeature Importances:\n", importance)


Final Model Test Accuracy: 0.6519

Feature Importances:
                                               Feature  Importance
4                                                odds    0.261108
0                                        Reach Inches    0.106935
2                   Significant Strike Body Attempted    0.105132
3                 Significant Strike Clinch Attempted    0.089186
13                      Avg_Winning Time_Seconds_Wins    0.079853
11        Avg_Significant Strike Clinch Landed_Losses    0.065269
12        Avg_Significant Strike Ground Landed_Losses    0.055264
8   Avg_Ground and Cage Control Time_Seconds_Losse...    0.048393
9       Avg_Significant Strike Leg Landed_Wins_KO_TKO    0.042571
1                               Takedown Total Landed    0.038973
10    Significant Strike Clinch Attempted_Wins_KO_TKO    0.037805
6                        Takedown Total Landed_Losses    0.036760
5          Avg_Takedown Total Attempted_Losses_KO_TKO    0.023793
7                 