In [5]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import numpy as np
import pandas as pd


# Both input files live in the same project folder; only the file name differs.
data_dir = "M:/Data Science/Data Analyst/Github & LinkedIN Portfolio Projects/Football Academy Performance Project"
long_fp = f"{data_dir}/football_performance_long1.csv"
summary_fp = f"{data_dir}/football_performance_summary.csv"

df_long = pd.read_csv(long_fp)
df_summary = pd.read_csv(summary_fp)

print("df_long columns:", df_long.columns.tolist())
print("df_summary columns:", df_summary.columns.tolist())

# Join key present in both frames.
common_column = 'player_id'

# Outer join keeps keys that appear in only one of the two frames (their
# missing side shows up as NaN in the null report below). Non-key columns
# that exist in both frames are disambiguated with the suffixes.
# validate='many_to_one' makes pandas raise a MergeError if `player_id` is
# duplicated in df_summary, instead of silently multiplying the match rows.
df = pd.merge(
    df_long,
    df_summary,
    on=common_column,
    how='outer',
    suffixes=('_long', '_summary'),
    validate='many_to_one',
)

print("\nCombined DataFrame shape:", df.shape)
print("\nFirst 5 rows of combined DataFrame:")
print(df.head())

print("\nMissing values in combined DataFrame:")
print(df.isnull().sum())

df_long columns: ['match_id', 'match_date', 'player_id', 'player_name', 'age', 'position', 'attendance', 'minutes_played', 'fitness_score', 'stamina', 'speed', 'passing_accuracy', 'tackles', 'goals', 'assists', 'shots_on_target', 'saves', 'yellow_card', 'red_card']
df_summary columns: ['player_id', 'player_name', 'age', 'position', 'matches_played', 'total_minutes', 'avg_fitness', 'avg_stamina', 'avg_speed', 'avg_passing', 'total_goals', 'total_assists', 'total_saves', 'total_tackles', 'yellow_cards', 'red_cards']

Combined DataFrame shape: (960, 34)

First 5 rows of combined DataFrame:
  match_id  match_date player_id player_name_long  age_long position_long  \
0     M200  01-01-2025     P1000            Anvay        12           MID   
1     M201  08-01-2025     P1000            Anvay        12           MID   
2     M202  15-01-2025     P1000            Anvay        12           MID   
3     M203  22-01-2025     P1000            Anvay        12           MID   
4     M204  29-01-202

In [8]:

import re
import pandas as pd
from sqlalchemy import create_engine

# Column-name prefixes that mark a column as a boolean flag.
bool_indicators = ['is_', 'has_', 'was_', 'did_']
bool_prefixes = tuple(bool_indicators)
for col in df.columns:
    # Match on the prefix only: a substring test would also hit unrelated
    # columns (e.g. 'analysis_score' contains 'is_') and coerce them to bool.
    if not str(col).startswith(bool_prefixes):
        continue
    try:
        df[col] = df[col].astype(bool)
    except (TypeError, ValueError) as exc:
        # Best effort: leave the column as it is, but report the failure
        # instead of hiding it.
        print(f"Could not convert column '{col}' to bool: {exc}")


# Derive age in years and an age-group bucket, but only when a datetime
# 'birth_date' column is actually available.
has_birth_date = 'birth_date' in df.columns
if has_birth_date and pd.api.types.is_datetime64_dtype(df['birth_date']):
    elapsed = pd.Timestamp.now() - df['birth_date']
    df['age'] = elapsed.dt.days / 365.25  # whole days -> approximate years

    # Bin edges and the label assigned to each interval between them.
    age_edges = [0, 12, 15, 18, 21, 25, 30, 100]
    age_labels = ['U12', 'U15', 'U18', 'U21', '21-25', '26-30', '30+']
    df['age_group'] = pd.cut(df['age'], bins=age_edges, labels=age_labels)


# Rescale every numeric performance/score/rating column to a 0-10 range.
# The candidate list is built from df itself so this cell does not depend on a
# `numeric_cols` variable left over from an earlier kernel session.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
performance_keywords = ('performance', 'score', 'rating')
performance_cols = [
    col for col in numeric_cols
    if any(keyword in str(col) for keyword in performance_keywords)
    # Skip this cell's own outputs so a re-run does not create
    # '<col>_normalized_normalized' columns.
    and not str(col).endswith('_normalized')
]
for col in performance_cols:
    # Min-max normalization to 0-10 scale
    min_val = df[col].min()
    max_val = df[col].max()
    if max_val > min_val:  # Avoid division by zero (constant or all-NaN column)
        df[f'{col}_normalized'] = 10 * (df[col] - min_val) / (max_val - min_val)


# Weighted composite score; computed only when all three inputs are present.
index_inputs = {'goals', 'assists', 'passes_completed'}
if index_inputs.issubset(df.columns):
    goal_points = df['goals'] * 3
    assist_points = df['assists'] * 2
    passing_points = df['passes_completed'] * 0.01
    df['performance_index'] = goal_points + assist_points + passing_points

print("\nCleaned DataFrame info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nSample of cleaned data:")
print(df.head())


# Connection settings. User and password can be supplied through the PGUSER /
# PGPASSWORD environment variables so real credentials never have to be typed
# into the notebook; the literals are only fallback placeholders.
db_username = os.environ.get('PGUSER', 'your_username')
db_password = os.environ.get('PGPASSWORD', 'your_password')
db_host = 'localhost'
db_port = '5432'
db_name = 'football_academy_db'
table_name = 'football_performance'

# Percent-encode user and password: characters such as '@', ':' or '/' would
# otherwise be parsed as URL delimiters.
connection_string = (
    f'postgresql://{quote_plus(db_username)}:{quote_plus(db_password)}'
    f'@{db_host}:{db_port}/{db_name}'
)
engine = create_engine(connection_string)

# Best-effort export: a failure is reported but does not stop the notebook.
try:
    df.to_sql(
        name=table_name,
        con=engine,
        if_exists='replace',  # Options: 'fail', 'replace', 'append'
        index=False,
        chunksize=1000  # Rows written per batch
    )
    print(f"\nSuccessfully exported data to PostgreSQL table: {table_name}")
except Exception as e:
    print(f"Error exporting to PostgreSQL: {e}")
finally:
    # Release pooled connections whether or not the export succeeded.
    engine.dispose()


Cleaned DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960 entries, 0 to 959
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   match_id                  960 non-null    object 
 1   match_date                960 non-null    object 
 2   player_id                 960 non-null    object 
 3   player_name_long          960 non-null    object 
 4   age_long                  960 non-null    int64  
 5   position_long             960 non-null    object 
 6   attendance                960 non-null    int64  
 7   minutes_played            960 non-null    int64  
 8   fitness_score             960 non-null    float64
 9   stamina                   960 non-null    float64
 10  speed                     960 non-null    float64
 11  passing_accuracy          960 non-null    float64
 12  tackles                   960 non-null    int64  
 13  goals                     960 non-null  

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,age_long,attendance,minutes_played,fitness_score,stamina,speed,passing_accuracy,tackles,goals,assists,...,avg_stamina,avg_speed,avg_passing,total_goals,total_assists,total_saves,total_tackles,yellow_cards,red_cards,fitness_score_normalized
count,960.0,960.0,960.0,960.0,960.0,960.0,960.0,960.0,960.0,960.0,...,960.0,960.0,960.0,960.0,960.0,960.0,960.0,960.0,960.0,960.0
mean,14.791667,0.922917,64.276042,69.838958,60.136875,63.737917,66.7675,1.071875,0.033333,0.034375,...,60.136875,63.737917,66.7675,0.666667,0.6875,2.895833,21.4375,1.416667,0.020833,5.490155
std,1.937053,0.266863,25.100712,9.940482,10.64875,10.460888,12.764259,1.296518,0.185314,0.187919,...,2.621101,2.902787,3.732828,1.143205,1.102774,9.679407,13.874014,1.239485,0.142901,1.603304
min,12.0,0.0,0.0,35.8,28.8,32.4,23.3,0.0,0.0,0.0,...,52.45,57.325,58.055,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13.0,1.0,53.0,63.6,53.2,56.675,58.05,0.0,0.0,0.0,...,58.895,61.88,63.87625,0.0,0.0,0.0,11.75,0.0,0.0,4.483871
50%,15.0,1.0,69.5,70.05,59.5,63.9,67.1,1.0,0.0,0.0,...,60.0525,63.75,66.7925,0.0,0.0,0.0,17.0,1.0,0.0,5.524194
75%,16.0,1.0,86.0,76.4,67.1,70.625,75.3,2.0,0.0,0.0,...,61.92375,65.4075,69.7125,1.0,1.0,0.0,36.0,2.0,0.0,6.548387
max,18.0,1.0,90.0,97.8,96.9,94.4,100.0,7.0,2.0,2.0,...,64.915,69.265,73.39,4.0,4.0,39.0,45.0,5.0,1.0,10.0


In [14]:
from sqlalchemy import create_engine

# Step 1: Connect to PostgreSQL
# The password is never stored in the notebook: it is taken from the
# PGPASSWORD environment variable, or prompted for when that is not set.
username = os.environ.get("PGUSER", "postgres")  # default user
password = os.environ.get("PGPASSWORD") or getpass(f"PostgreSQL password for {username}: ")
host = "localhost"         # if running locally
port = "5432"              # default PostgreSQL port
database = "Football_Academy"    # the database created in pgAdmin

# Percent-encode user and password so characters such as '@', ':' or '/'
# cannot be mistaken for URL delimiters.
engine = create_engine(
    f"postgresql+psycopg2://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/{database}"
)

# Step 2: Load DataFrame into PostgreSQL
table_name = "FA_P&E"
try:
    # if_exists="replace" drops and recreates the table on every run.
    df.to_sql(table_name, engine, if_exists="replace", index=False)
finally:
    # Release pooled connections even when the load fails.
    engine.dispose()

print(f"Data successfully loaded into table '{table_name}' in database '{database}'.")

Data successfully loaded into table 'FA_P&E' in database 'Football_Academy'.
