In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from scipy.optimize import linprog

# Step 1: Load Data
# Correct file paths to avoid FileNotFoundError

official_data_path = r'C:\Users\Rufeeh\OneDrive\Desktop\Africa-Data-School-Curriculum\players.csv'  
official_data = pd.read_csv(official_data_path)


# Step 2: Data Cleaning and Exploration
def clean_data(data):
    # Drop duplicates and handle missing values
    data = data.drop_duplicates()
    data = data.fillna(0)
    return data

# Step 3: Feature Engineering
def feature_engineering(data):
    # Create features such as points per minute, fixture difficulty, etc.
    data['points_per_minute'] = data['total_points'] / data['minutes']
    data['form_diff'] = data['form'] - data['recent_form']
    data['adjusted_difficulty'] = data['fixture_difficulty'] - data['home_advantage']
    return data

#features_data = feature_engineering(cleaned_data)

# Step 4: Train-Test Split
X = features_data[['points_per_minute', 'form_diff', 'adjusted_difficulty', 'minutes', 'goals_scored', 'assists']]
y = features_data['total_points']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Model Training
model = RandomForestRegressor(random_state=42)
param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20, None]}
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Step 6: Evaluate Model
predictions = best_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

# Step 7: Optimization (Team Selection)
# Budget and Constraints
budget = 100  # in millions
num_players = 15
positions = {'GK': 2, 'DEF': 5, 'MID': 5, 'FWD': 3}

# Variables
costs = features_data['price'].values
predicted_points = best_model.predict(X)
positions_data = pd.get_dummies(features_data['position']).values

# Linear Programming
c = -predicted_points  # Maximize points
A = [costs, *positions_data.T]
b = [budget] + [positions[pos] for pos in positions]
bounds = [(0, 1) for _ in range(len(costs))]  # Player selection as binary

result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs')

# Step 8: Novel Idea: Gameweek-Specific Risk Adjustment
def risk_adjustment(predictions, data):
    """
    Adjust predictions based on form variability and fixture risk.
    """
    data['risk_factor'] = np.abs(data['form_diff']) / (data['fixture_difficulty'] + 1)
    adjusted_predictions = predictions * (1 - data['risk_factor'])
    return adjusted_predictions

adjusted_points = risk_adjustment(predicted_points, features_data)

# Final Selected Team
selected_players = features_data.iloc[np.where(result.x > 0.5)]
print("Selected Team:")
print(selected_players[['name', 'position', 'price', 'predicted_points']])


NameError: name 'features_data' is not defined