## Import Libraries

In [None]:
# To ignore warnings
#!pip install scikit-learn

#import warnings
#warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import metrics
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


#%matplotlib inline

## Load the Data

In [None]:
# Read "cleanfootball.csv" (path relative to the working directory) into the DataFrame `df`.
df = pd.read_csv("cleanfootball.csv")

In [None]:
# Show the frame's dimensions as (rows, columns).
df.shape

In [None]:
# Plot pairwise relationships between the columns of `df`.
# NOTE(review): the grid grows quadratically with the number of columns, so this may be slow — confirm it is worth running on the full frame.
sns.pairplot(df)

In [None]:
# Preview the first three rows.
df.head(3)

## Preprocess the Data

#### 1. Simple EDA + Data Quality checking

In [None]:
# Uniqueness: count fully duplicated rows *before* removing them. Inspecting
# `df.duplicated()` only after the drop always yields an empty result and
# therefore says nothing about the quality of the loaded data.
duplicate_count = df.duplicated().sum()
print(f'Duplicate rows found: {duplicate_count}')

# Reassign rather than mutate in place, so `df` is rebound explicitly.
df = df.drop_duplicates()

# Sanity check: no duplicated rows may remain after the drop.
assert not df.duplicated().any()

In [None]:
# Completeness: number of missing (null) values in each column.
df.isnull().sum()

In [None]:
# Accuracy: inspect the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Column names treated as numeric; used below to draw one histogram per column.
numeric_features = [
    'height',
    'age',
    'appearance',
    'goals',
    'assists',
    'yellow cards',
    'second yellow cards',
    'red cards',
    'goals conceded',
    'clean sheets',
    'minutes played',
    'days_injured',
    'games_injured',
    'award',
    'current_value',
    'highest_value',
]

# Column names treated as categorical; these are one-hot encoded later on.
categorical_features = ['name', 'team', 'position']


In [None]:
# Draw one 30-bin histogram per numeric column, each on its own figure.
for column in numeric_features:
    fig, ax = plt.subplots()
    ax.hist(df[column], bins=30, edgecolor='k', alpha=0.7)

    # Title and axis labels so every figure can be read on its own.
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')

    # Render this figure before moving on to the next column.
    plt.show()


#### 2. Feature engineering

1. Feature scaling
2. Aggregation
3. One-hot encoding

In [None]:
# Display the list of columns that will be one-hot encoded in the next cell.
categorical_features

In [None]:
# One-hot encode every column listed in `categorical_features`; each of them is
# replaced by one indicator column per distinct value.
# NOTE(review): 'name' is presumably close to unique per row, which would add a
# very large number of columns — confirm it should be encoded rather than dropped.
df = pd.get_dummies(df, columns=categorical_features)

#### 3. Feature selection

In [None]:
# sns.heatmap(df.corr(), annot=True);

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index column saved into the
# CSV — confirm). `errors='ignore'` makes the cell safe to re-run: without it a
# second execution raises KeyError because the column is already gone.
# Reassigning instead of using inplace=True keeps the rebinding explicit.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Pairwise correlation matrix of all columns in `df`.
correlation = df.corr()

# Correlation of every column with the target, strongest positive first.
target_correlation = correlation['current_value'].sort_values(ascending=False)
print(target_correlation)

In [None]:
# Columns kept for modelling: the numeric predictors, the target
# ('current_value') and a subset of the one-hot encoded team indicators.
# NOTE(review): 'team_Brighton &amp; Hove Albion' contains an HTML entity; this
# presumably mirrors the raw team name in the data — confirm before changing it.
selected_features = [
    # numeric columns (including the target)
    'age',
    'appearance',
    'goals',
    'assists',
    'minutes played',
    'days_injured',
    'games_injured',
    'award',
    'current_value',
    'highest_value',
    # team indicator columns
    'team_AC Milan',
    'team_Arsenal FC',
    'team_Aston Villa',
    'team_Atalanta BC',
    'team_Atlético de Madrid',
    'team_Borussia Dortmund',
    'team_Brentford FC',
    'team_Brighton &amp; Hove Albion',
    'team_Chelsea FC',
    'team_Everton FC',
    'team_Leeds United',
    'team_Leicester City',
    'team_Newcastle United',
    'team_SSC Napoli',
    'team_Southampton FC',
    'team_Tottenham Hotspur',
    'team_Wolverhampton Wanderers',
]

In [None]:
# Keep only the selected columns (in the order listed) and preview the result.
df = df.loc[:, selected_features]
df.head()

#### 4. Prepare train and test data

In [None]:
# Separate the regression target from the predictor columns.
y = df['current_value']
X = df.drop(columns=['current_value'])

# Hold out 30% of the rows for testing; the fixed seed makes the shuffled
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

# Scale the data: the scaler learns its statistics from the training split only
# and the same transformation is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Dimensions of the predictor matrix as (rows, features).
X.shape

## Building the Model

In [None]:
# Linear regression estimator created with scikit-learn's default settings.
model = LinearRegression()

## Train the Model

In [None]:
# Fit the model on the standardised training features and the training target.
model.fit(X_train_scaled, y_train)

## Test the Model

In [None]:
# Predict the target for the standardised test features; the metrics are
# computed in the evaluation cells further down.
y_pred = model.predict(X_test_scaled)

## Interpretation of the Model

In [None]:
# Fitted coefficients, one per column of X (the model was trained on the
# standardised features, so they are on the scaled-feature scale).
model.coef_

In [None]:
# Label each fitted coefficient with the name of the feature it belongs to.
coeff_df = pd.DataFrame(
    data=model.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
coeff_df

In [None]:
# Intercept term of the fitted linear model.
print(model.intercept_) 

In [None]:
# Regularised linear models (Lasso and Ridge) tuned with a grid search.
#
# The steps are ordered so the cell works on a fresh kernel: the searches are
# created and fitted *before* anything reads `lasso_model.best_estimator_`.
# (Reading it first raises NameError under Restart & Run All.)

# Candidate regularisation strengths shared by both searches.
param_grid = {
    'alpha': [0.001, 0.01, 1, 11, 22, 33, 44]
}

# Grid searches using GridSearchCV's default cross-validation and scoring.
lasso_model = GridSearchCV(Lasso(), param_grid)
ridge_model = GridSearchCV(Ridge(), param_grid)

# Fit both searches on the standardised training data.
lasso_model.fit(X_train_scaled, y_train)
ridge_model.fit(X_train_scaled, y_train)

# Coefficients of the best Lasso estimator, labelled by feature name.
coeff_df = pd.DataFrame(lasso_model.best_estimator_.coef_, X.columns, columns=['Coefficient'])

# Store the Lasso predictions under their own name. Assigning them to `y_test`
# would replace the ground-truth test target and invalidate every metric
# computed from `y_test` afterwards.
y_pred_lasso = lasso_model.predict(X_test_scaled)

coeff_df

## Evaluating the Model 

1. Mean Squared Error

In [None]:
# Baseline "model": predict the training-set mean for every test row.
baseline_value = y_train.mean()
y_base = [baseline_value] * len(y_test)

In [None]:
# Benchmark: MSE obtained by always predicting the training mean.
mse_base = mean_squared_error(y_true=y_test, y_pred=y_base)
print(f'Mean Squared Error of base model: {mse_base}')

In [None]:
# MSE of the linear model on the training split and on the held-out test split.
y_train_pred = model.predict(X_train_scaled)
mse_train = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
mse_test = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f'Mean Squared Error for training: {mse_train}')
print(f'Mean Squared Error for testing: {mse_test}')

2. Mean Absolute Error

In [None]:
# our benchmark model
# The baseline MAE gets its own name so the baseline MSE held in `mse_base`
# is not overwritten by a different metric.
mae_base = mean_absolute_error(y_test, y_base)
print(f'Mean Absolute  Error of base model: {mae_base}')

In [None]:
# MAE of the linear model on the training split and on the held-out test split.
y_train_pred = model.predict(X_train_scaled)
mae_train = mean_absolute_error(y_true=y_train, y_pred=y_train_pred)
mae_test = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print(f'Mean Absolute Error for training: {mae_train}')
print(f'Mean Absolute Error for testing: {mae_test}')

3. Root Mean Squared Error

In [None]:
# our benchmark model
# RMSE is the square root of the baseline *MSE*. It is recomputed here from
# `y_test` and `y_base` so the result does not depend on whatever value an
# earlier cell last stored in a shared variable.
rmse_base = np.sqrt(mean_squared_error(y_test, y_base))
print(f'Root Mean Squared  Error of base model: {rmse_base}')

In [None]:
# RMSE is the square root of the MSE values computed in the MSE section.
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

print(f'Root Mean Squared Error for training: {rmse_train}')
print(f'Root Mean Squared Error for testing: {rmse_test}')

4. R-squared (coefficient of determination)

In [None]:
# R² of the linear model on the training split and on the held-out test split.
y_train_pred = model.predict(X_train_scaled)
r2_score_train = r2_score(y_true=y_train, y_pred=y_train_pred)
r2_score_test = r2_score(y_true=y_test, y_pred=y_pred)

print(f'R Square for training: {r2_score_train}')
print(f'R Square for testing: {r2_score_test}')

#### Predictions vs. Real Values

In [None]:
# Linear-model predictions for the standardised test features (the same call
# that produced `y_pred` in the "Test the Model" section).
predictions = model.predict(X_test_scaled)
#predictions

In [None]:
# Copy the test target into a NumPy array for plotting against the predictions.
Real_Values = np.array(y_test)
#Real_Values

In [None]:
# Predicted vs. real target values for the test set. Axis labels and a title
# are set so the figure can be read without the surrounding cells.
fig, ax = plt.subplots()
ax.scatter(Real_Values, predictions)
ax.set_xlabel('Real values')
ax.set_ylabel('Predicted values')
ax.set_title('Predictions vs. Real Values');

#### Residual Histogram

In [None]:
# Residuals of the linear model on the test set.
residuals = y_test - predictions

# `sns.distplot` is deprecated; `histplot` with a density-normalised histogram
# and a KDE overlay is its documented replacement.
ax = sns.histplot(residuals, bins=50, kde=True, stat='density')
ax.set(xlabel='Residual', title='Residual Histogram');  # it's good if we get normal distribution