In [1]:
import pandas as pd

In [2]:
from google.colab import files
uploaded = files.upload()

Saving first_500000_rows.csv to first_500000_rows.csv


In [3]:
df = pd.read_csv('/content/first_500000_rows.csv')

In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder # Import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
import numpy as np

In [5]:
# Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df.select_dtypes(exclude=np.number).columns.tolist()

target_column = 'player_final_value'

# Remove the target column from the feature columns
if target_column in numerical_cols:
    numerical_cols.remove(target_column)
elif target_column in categorical_cols:
    categorical_cols.remove(target_column)
else:
    print(f"Warning: Target column '{target_column}' not found in the DataFrame.")


# 1. Handle Missing Values (replace with appropriate strategy)
for col in df.columns:
    if df[col].isnull().any():
        if df[col].dtype == 'object':
            df[col] = df[col].fillna(df[col].mode()[0]) # Mode for categorical
        else:
            df[col] = df[col].fillna(df[col].mean()) # Mean for numerical

# 2. Remove Duplicates
df = df.drop_duplicates()

# 3. Outlier Handling (Optional -  consider IQR or Z-score)
# Example using IQR for numerical features:
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

# 4. Preprocessing for Modeling
X = df.drop(target_column, axis=1)
y = df[target_column]

# Drop rows with null values in the target variable 'y'
df = df.dropna(subset=[target_column])
y = df[target_column]

#Redefine X after dropping rows with null values in y
X = df.drop(target_column, axis=1)


# Create transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore') # Handle unseen categories in test data

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Fit and transform the data
X = preprocessor.fit_transform(X)


# Now, X is ready for splitting and modeling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Shapes after splitting:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

Shapes after splitting:
X_train: (377408, 27277)
X_test: (94353, 27277)
y_train: (377408,)
y_test: (94353,)


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [7]:
# Remove the target column from the feature columns
if target_column in numerical_cols:
    numerical_cols.remove(target_column)
elif target_column in categorical_cols:
    categorical_cols.remove(target_column)
else:
    print(f"Warning: Target column '{target_column}' not found in the DataFrame.")

# Create a copy of the DataFrame for cleaning
df_cleaned = df.copy()

# 1. Handle Missing Values
for col in df_cleaned.columns:
    if df_cleaned[col].isnull().any():
        if df_cleaned[col].dtype == 'object':
            df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mode()[0])
        else:
            df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mean())

# 2. Remove Duplicates
df_cleaned = df_cleaned.drop_duplicates()

# 3. Outlier Handling (using IQR)
for col in numerical_cols:
    Q1 = df_cleaned[col].quantile(0.25)
    Q3 = df_cleaned[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_cleaned = df_cleaned[(df_cleaned[col] >= lower_bound) & (df_cleaned[col] <= upper_bound)]

# 4. Clean the target column *before* splitting the data
# Handle missing values in the target column
if df_cleaned[target_column].isnull().any():
    if df_cleaned[target_column].dtype == 'object':
        df_cleaned[target_column] = df_cleaned[target_column].fillna(df_cleaned[target_column].mode()[0])
    else:
        df_cleaned[target_column] = df_cleaned[target_column].fillna(df_cleaned[target_column].mean())


# 5. Preprocessing for Modeling
X = df_cleaned.drop(target_column, axis=1)
y = df_cleaned[target_column]

# Ensure that no null values remain in y and align X and y *before* preprocessing
y.dropna(inplace=True)
X = X.loc[y.index]  # Align X and y using .loc for DataFrame indexing

# Create transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Fit and transform the data
X = preprocessor.fit_transform(X)  # Now X is transformed after alignment

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shapes after splitting:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

Shapes after splitting:
X_train: (309792, 24803)
X_test: (77448, 24803)
y_train: (309792,)
y_test: (77448,)


In [8]:
target_column = 'player_final_value'

# Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df.select_dtypes(exclude=np.number).columns.tolist()

# Remove target from feature lists
if target_column in numerical_cols:
    numerical_cols.remove(target_column)
elif target_column in categorical_cols:
    categorical_cols.remove(target_column)

# Handle missing values
for col in df.columns:
    if df[col].isnull().any():
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].mean())
        else:
            df[col] = df[col].fillna(df[col].mode()[0])

# Remove duplicates
df.drop_duplicates(inplace=True)

# Outlier handling (example using IQR)
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

# Handle missing values in the target variable *before* splitting
if df[target_column].isnull().any():
    if pd.api.types.is_numeric_dtype(df[target_column]):
        df[target_column] = df[target_column].fillna(df[target_column].mean())
    else:
        df[target_column] = df[target_column].fillna(df[target_column].mode()[0])

# Define X and y AFTER handling missing values in the target variable
X = df.drop(target_column, axis=1)
y = df[target_column]


# Ensure no null values in y and align X and y
y.dropna(inplace=True)
X = X.loc[y.index]

#Preprocessing
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)


X = preprocessor.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shapes after splitting:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

Shapes after splitting:
X_train: (309792, 24803)
X_test: (77448, 24803)
y_train: (309792,)
y_test: (77448,)


In [9]:
# Map card values
def map_card_values(df, column_name):
    """Maps card values to their numerical equivalents.

    Args:
        df: The Pandas DataFrame containing the card data.
        column_name: The name of the column containing the card values.

    Returns:
        The DataFrame with the mapped card values.
    """
    value_mapping = {
        'A': 11,
        'J': 10,
        'Q': 10,
        'K': 10
    }

    # Convert existing numerical values to strings
    # so they can be consistently mapped
    df[column_name] = df[column_name].astype(str)

    df[column_name] = df[column_name].map(value_mapping).fillna(pd.to_numeric(df[column_name], errors='coerce'))

    # Convert any values that remain as strings to NaN
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')


    return df

In [10]:
# Before fitting the model, ensure y_train is numeric
# Check if y is already numeric (float or int)
if not pd.api.types.is_numeric_dtype(y):
    # If not numeric, extract numerical target values from list-like strings
    y = y.str.extract(r'\[(\d+)\]', expand=False).astype(float)
else:
    print("y is already numeric. Skipping string extraction.")

# Ensure no null values in y and align X and y *BEFORE* preprocessing
y.dropna(inplace=True)

# Assuming 'df' is your original DataFrame before preprocessing
X = df.drop(target_column, axis=1)  # Reset X to the original DataFrame
X = X.loc[y.index]  # Align X and y using .loc on the DataFrame

# Now proceed with preprocessing
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

X = preprocessor.fit_transform(X)  # Now X is transformed after alignment

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 1. Linear Regression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_linear = linear_reg.predict(X_test)

mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

print("Linear Regression:")
print(f"Mean Squared Error: {mse_linear}")
print(f"R-squared: {r2_linear}")

Linear Regression:
Mean Squared Error: 0.03160021873596134
R-squared: 0.9971478548857413


In [11]:
# 2. Decision Tree Regression
tree_reg = DecisionTreeRegressor(random_state=42) # Add random_state for reproducibility
tree_reg.fit(X_train, y_train)
y_pred_tree = tree_reg.predict(X_test)

# Before calculating the error, handle NaN values in y_test and y_pred_tree
y_test_no_nan = y_test.dropna()  # Remove rows with NaN in y_test
y_pred_tree_no_nan = y_pred_tree[y_test.notna()]  # Keep predictions corresponding to non-NaN y_test values

mse_tree = mean_squared_error(y_test_no_nan, y_pred_tree_no_nan)  # Calculate MSE using non-NaN values
r2_tree = r2_score(y_test_no_nan, y_pred_tree_no_nan)  # Calculate R-squared using non-NaN values

print("\nDecision Tree Regression:")
print(f"Mean Squared Error: {mse_tree}")
print(f"R-squared: {r2_tree}")


Decision Tree Regression:
Mean Squared Error: 0.1947424818046452
R-squared: 0.9824231021101876


In [12]:
# 3. Random Forest Regression
forest_reg = RandomForestRegressor(random_state=42, n_estimators=100) # Increased n_estimators
forest_reg.fit(X_train, y_train)
y_pred_forest = forest_reg.predict(X_test)

# Before calculating the error, handle NaN values in y_test
y_test_no_nan = y_test.dropna()  # Remove rows with NaN in y_test
y_pred_forest_no_nan = y_pred_forest[y_test.notna()]  # Keep predictions corresponding to non-NaN y_test values

# Calculate MSE and R-squared using non-NaN values
mse_forest = mean_squared_error(y_test_no_nan, y_pred_forest_no_nan)
r2_forest = r2_score(y_test_no_nan, y_pred_forest_no_nan)

print("\nRandom Forest Regression:")
print(f"Mean Squared Error: {mse_forest}")
print(f"R-squared: {r2_forest}")


Random Forest Regression:
Mean Squared Error: 0.13207860810754396
R-squared: 0.9880789636312446


I feel the Linear Regression model is the best model to use based on the lower Mean Squared Error.