In [None]:
# --- Missing-value audit ---
# Count the missing entries per column so the effect of the imputation
# below can be judged against this report.
missing_values = baseball_df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# --- Missing-value handling ---
# Two common strategies: imputation (fillna) or removal (dropna).
#
# Imputation is used here: every missing entry is replaced by the mean of its
# column. `numeric_only=True` restricts the means to numeric columns; without
# it, pandas >= 2.0 raises a TypeError as soon as the frame contains a
# non-numeric column. Non-numeric columns (if any) are left untouched.
# The result is rebound to `baseball_df` rather than mutated with
# `inplace=True`, so re-running the cell is harmless.
baseball_df = baseball_df.fillna(baseball_df.mean(numeric_only=True))

# Alternative (removal): drop every row that contains a missing value.
# baseball_df = baseball_df.dropna()

# --- Data-type conversion (only if needed) ---
# Inspect the current types with `baseball_df.dtypes`, then convert with
# `astype()` or another appropriate method, e.g. float -> int:
# baseball_df['Column_Name'] = baseball_df['Column_Name'].astype(int)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's "whitegrid" style to the figures drawn below.
sns.set(style="whitegrid")

# Pairplot: one panel per pair of the selected numerical columns.
sns.pairplot(baseball_df[['W', 'R', 'AB', 'H', '2B', '3B', 'HR', 'BB', 'SO', 'SB', 'RA', 'ER', 'ERA', 'CG', 'SHO', 'SV', 'E']])
# y slightly above 1 places the title over the grid instead of on top of it.
plt.suptitle("Pairplot of Numerical Features", y=1.02)
plt.show()

# Correlation heatmap of the numerical columns.
# `numeric_only=True` keeps the computation restricted to numeric columns;
# without it, pandas >= 2.0 raises a ValueError when the frame contains
# columns that cannot be converted to float (e.g. strings).
correlation_matrix = baseball_df.corr(numeric_only=True)
plt.figure(figsize=(15, 12))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5)
plt.title("Correlation Heatmap")
plt.show()

# Scatter plot of one feature ('R') against the target variable ('W').
plt.figure(figsize=(10, 6))
sns.scatterplot(x='R', y='W', data=baseball_df)
plt.title("Runs Scored vs. Number of Wins")
plt.xlabel("Runs Scored")
plt.ylabel("Number of Wins")
plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Columns to rescale. Note that the list includes 'W', which a later cell
# uses as the prediction target.
numerical_features = [
    'W', 'R', 'AB', 'H', '2B', '3B', 'HR', 'BB', 'SO',
    'SB', 'RA', 'ER', 'ERA', 'CG', 'SHO', 'SV', 'E',
]

# One scaler per technique.
scaler_standard, scaler_minmax = StandardScaler(), MinMaxScaler()

# Both scalers are fitted on the same slice of the original frame.
_numeric_block = baseball_df[numerical_features]

# Standardization: work on a copy so `baseball_df` itself stays unscaled.
baseball_df_standardized = baseball_df.copy()
baseball_df_standardized[numerical_features] = (
    scaler_standard.fit(_numeric_block).transform(_numeric_block)
)

# Normalization (min-max scaling): again on an independent copy.
baseball_df_normalized = baseball_df.copy()
baseball_df_normalized[numerical_features] = (
    scaler_minmax.fit(_numeric_block).transform(_numeric_block)
)

# Preview the first rows of each rescaled frame.
for _heading, _frame in (
    ("Standardized DataFrame:", baseball_df_standardized),
    ("\nNormalized DataFrame:", baseball_df_normalized),
):
    print(_heading)
    print(_frame.head())


In [None]:
from sklearn.model_selection import train_test_split

# Target is the 'W' column; the predictors are every remaining column.
target = baseball_df['W']
features = baseball_df.drop(columns='W')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each of the four pieces.
for _label, _part in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(_label, _part.shape)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Create the linear regression model and fit it on the training split in one
# step (`fit` returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions with mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error:", mse)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Training-set evaluation of the linear regression model.
#
# `model` is the LinearRegression already fitted on (X_train, y_train) in the
# previous cell. It is reused here instead of being re-created and re-fitted:
# fitting the same estimator on the same data again only duplicates work.

# Make predictions on the training set
y_train_pred = model.predict(X_train)

# Mean squared error on the data the model was trained on; compare it with the
# testing-set error to gauge over- or under-fitting.
mse_train = mean_squared_error(y_train, y_train_pred)
print("Training Set Mean Squared Error:", mse_train)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict the held-out split with the already fitted `model`.
y_test_pred = model.predict(X_test)

# Compute both metrics up front: mean squared error and R-squared.
mse_test = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

# Report them in the usual order.
print("Testing Set Mean Squared Error:", mse_test)
print("R-squared:", r2)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model.
# A fixed random_state makes the bootstrap sampling and feature selection
# repeatable, so the selected hyperparameters and the scores below do not
# change from one run to the next (42 matches the seed used for the
# train/test split).
rf_model = RandomForestRegressor(random_state=42)

# Hyperparameter grid to search: 3 * 3 * 3 * 3 = 81 combinations.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search with 5-fold cross-validation on the training data.
# scikit-learn maximizes scores, hence the *negative* MSE; n_jobs=-1 uses all
# available CPU cores.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Best hyperparameter combination found by the search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Estimator configured with the best hyperparameters (GridSearchCV refits it
# on the full training set by default).
best_model = grid_search.best_estimator_

# Make predictions on the testing set using the best model
y_test_pred_tuned = best_model.predict(X_test)

# Evaluate the tuned model on the held-out split
mse_test_tuned = mean_squared_error(y_test, y_test_pred_tuned)
r2_tuned = r2_score(y_test, y_test_pred_tuned)

print("\nTesting Set Mean Squared Error (Tuned Model):", mse_test_tuned)
print("R-squared (Tuned Model):", r2_tuned)
