In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset.
# Only the file read is guarded: a missing CSV still gets the friendly message,
# but the error is re-raised so the notebook stops here instead of continuing
# with `df` / `model` undefined and failing later with an unrelated NameError.
try:
    df = pd.read_csv('cricket_features.csv')
except FileNotFoundError:
    print("Error: The file 'cricket_features.csv' was not found.")
    raise

# Display the first few rows
print("First 5 rows of the data:")
print(df.head())

# Display DataFrame information (column types, non-null counts)
print("\nDataFrame Info:")
df.info()

# Identify categorical and numerical features
# 'win' is the target variable
categorical_features = ['batting_team', 'bowling_team', 'venue', 'toss_winner', 'toss_decision']
numerical_features = ['runs_required', 'balls_remaining', 'wickets_in_hand', 'target_match', 'current_run_rate', 'required_run_rate']
target = 'win'

# Fail early, with a clear message, if the CSV does not have the columns the
# pipeline below selects by name.
missing_columns = [
    column
    for column in categorical_features + numerical_features + [target]
    if column not in df.columns
]
if missing_columns:
    raise KeyError(f"'cricket_features.csv' is missing expected columns: {missing_columns}")

# Define preprocessors
# For numerical features: impute missing values (if any) with median and scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# For categorical features: impute missing values (if any) with most frequent and one-hot encode.
# handle_unknown='ignore' keeps predict() from failing on categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create the ColumnTransformer to apply different transformers to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model pipeline, including preprocessing and the classifier.
# Keeping preprocessing inside the pipeline means it is fitted on the training split only.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split the data into features (X) and target (y)
X = df.drop(columns=target)
y = df[target]

# Split the data into training and testing sets
# stratify=y ensures the distribution of the target variable is the same in both sets
# NOTE(review): the preview above shows consecutive ball-by-ball rows of a single
# match. A random row-level split presumably puts balls from the same match in
# both train and test, which would inflate the reported accuracy. If a match
# identifier is available, a group-aware split would be safer - TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train the model
print("\nTraining the model...")
model.fit(X_train, y_train)
print("Model training complete.")

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"\nAccuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)

First 5 rows of the data:
                  batting_team         bowling_team  \
0  Royal Challengers Bangalore  Sunrisers Hyderabad   
1  Royal Challengers Bangalore  Sunrisers Hyderabad   
2  Royal Challengers Bangalore  Sunrisers Hyderabad   
3  Royal Challengers Bangalore  Sunrisers Hyderabad   
4  Royal Challengers Bangalore  Sunrisers Hyderabad   

                                       venue  runs_required  balls_remaining  \
0  Rajiv Gandhi International Stadium, Uppal            206              119   
1  Rajiv Gandhi International Stadium, Uppal            206              118   
2  Rajiv Gandhi International Stadium, Uppal            206              117   
3  Rajiv Gandhi International Stadium, Uppal            204              116   
4  Rajiv Gandhi International Stadium, Uppal            200              115   

   wickets_in_hand  target_match  current_run_rate  required_run_rate  \
0               10           207               6.0          10.386555   
1               

In [2]:
# --- Get the OneHotEncoder feature names ---
# Requires `model` to have been fitted by the training cell above.

# 1. Access the fitted 'preprocessor' step (the ColumnTransformer) from the main pipeline
preprocessor_step = model.named_steps['preprocessor']

# 2. Access the fitted 'cat' (categorical) transformer from the ColumnTransformer
categorical_transformer_pipeline = preprocessor_step.named_transformers_['cat']

# 3. Access the 'onehot' step from the categorical pipeline
one_hot_encoder = categorical_transformer_pipeline.named_steps['onehot']

# 4. Get the feature names
# Read the categorical column list back from the fitted ColumnTransformer rather
# than re-typing it here, so it cannot drift out of sync with (or be reordered
# relative to) what the encoder was actually fitted on.
categorical_features = next(
    list(columns)
    for transformer_name, _, columns in preprocessor_step.transformers_
    if transformer_name == 'cat'
)
# The column names are passed explicitly because the encoder sits behind the
# imputer inside the pipeline and may not have recorded them itself.
encoded_feature_names = one_hot_encoder.get_feature_names_out(categorical_features)

# Now you can filter and print them.
# Encoded names have the form '<column>_<category>', so match on the prefix
# instead of a substring that could also occur inside a category value.
team_prefixes = ('batting_team_', 'bowling_team_')
team_encodings = [name for name in encoded_feature_names if name.startswith(team_prefixes)]
venue_encodings = [name for name in encoded_feature_names if name.startswith('venue_')]

print("--- Team Encodings ---")
print(team_encodings)

print("\n--- Venue Encodings ---")
print(venue_encodings)

--- Team Encodings ---
['batting_team_Chennai Super Kings', 'batting_team_Deccan Chargers', 'batting_team_Delhi Capitals', 'batting_team_Delhi Daredevils', 'batting_team_Gujarat Lions', 'batting_team_Kings XI Punjab', 'batting_team_Kochi Tuskers Kerala', 'batting_team_Kolkata Knight Riders', 'batting_team_Mumbai Indians', 'batting_team_Pune Warriors', 'batting_team_Rajasthan Royals', 'batting_team_Rising Pune Supergiant', 'batting_team_Rising Pune Supergiants', 'batting_team_Royal Challengers Bangalore', 'batting_team_Sunrisers Hyderabad', 'bowling_team_Chennai Super Kings', 'bowling_team_Deccan Chargers', 'bowling_team_Delhi Capitals', 'bowling_team_Delhi Daredevils', 'bowling_team_Gujarat Lions', 'bowling_team_Kings XI Punjab', 'bowling_team_Kochi Tuskers Kerala', 'bowling_team_Kolkata Knight Riders', 'bowling_team_Mumbai Indians', 'bowling_team_Pune Warriors', 'bowling_team_Rajasthan Royals', 'bowling_team_Rising Pune Supergiant', 'bowling_team_Rising Pune Supergiants', 'bowling_tea